R基础

加载库

library(car)
## Loading required package: carData
library(tidyverse) # 加载tidyverse包
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych) # 加载psych包
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## The following object is masked from 'package:car':
## 
##     logit
library(ggmosaic)

指定图形中的中文字体

par(family = "STKaiti")
library(showtext)
## Loading required package: sysfonts
## Loading required package: showtextdb
showtext_auto()

数据管理

数据导入

library(readr) # 加载readr包
library(readxl) # 加载readxl包

# datafile <- read_excel(file.choose()) # 读取Excel文件
datafile <- read_excel("BMI.xlsx") # 读取Excel文件
# datafile <- read_csv("test.csv") # 读取Excel文件
datafile <- datafile %>% 
  rename(height = "身高", 
         weight = "体重", 
         gender = "性别", 
         attitude = "态度"
         ) # 对变量进行重命名

数据集基本信息

datafile %>% 
  head() # 查看数据集的前几行
## # A tibble: 6 × 6
##      id gender height weight attitude   BMI
##   <dbl>  <dbl>  <dbl>  <dbl>    <dbl> <dbl>
## 1     1      1    180     85        3  26.2
## 2     2      0    165     62        2  22.8
## 3     3      0    168     65        1  23.0
## 4     4      1    175     65        1  21.2
## 5     5      1    165     63        1  23.1
## 6     6      1    168     60        3  21.3
datafile %>% 
  str() # 查看数据集的结构
## tibble [14 × 6] (S3: tbl_df/tbl/data.frame)
##  $ id      : num [1:14] 1 2 3 4 5 6 7 8 9 10 ...
##  $ gender  : num [1:14] 1 0 0 1 1 1 1 1 1 0 ...
##  $ height  : num [1:14] 180 165 168 175 165 168 175 170 165 163 ...
##  $ weight  : num [1:14] 85 62 65 65 63 60 70 64 58 55 ...
##  $ attitude: num [1:14] 3 2 1 1 1 3 3 3 2 2 ...
##  $ BMI     : num [1:14] 26.2 22.8 23 21.2 23.1 ...
datafile %>% 
  dim() # 查看数据集的维度
## [1] 14  6
datafile %>% 
  nrow()  # 查看数据集的行数
## [1] 14
datafile %>% 
  ncol()  # 查看数据集的列数
## [1] 6
datafile %>% 
  attributes() # 查看数据集的属性
## $class
## [1] "tbl_df"     "tbl"        "data.frame"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14
## 
## $names
## [1] "id"       "gender"   "height"   "weight"   "attitude" "BMI"
datafile %>% 
  class() # 查看数据集的类别
## [1] "tbl_df"     "tbl"        "data.frame"
datafile %>% 
  names() # 查看数据集的变量名
## [1] "id"       "gender"   "height"   "weight"   "attitude" "BMI"
datafile %>% 
  summary() # 查看数据集的描述性统计
##        id            gender        height          weight         attitude    
##  Min.   : 1.00   Min.   :0.0   Min.   :158.0   Min.   :45.00   Min.   :1.000  
##  1st Qu.: 4.25   1st Qu.:0.0   1st Qu.:163.5   1st Qu.:52.75   1st Qu.:1.000  
##  Median : 7.50   Median :0.5   Median :165.5   Median :61.00   Median :2.000  
##  Mean   : 7.50   Mean   :0.5   Mean   :167.0   Mean   :60.36   Mean   :1.857  
##  3rd Qu.:10.75   3rd Qu.:1.0   3rd Qu.:169.5   3rd Qu.:64.75   3rd Qu.:2.750  
##  Max.   :14.00   Max.   :1.0   Max.   :180.0   Max.   :85.00   Max.   :3.000  
##       BMI       
##  Min.   :18.03  
##  1st Qu.:20.41  
##  Median :21.28  
##  Mean   :21.50  
##  3rd Qu.:22.84  
##  Max.   :26.23
#-------------------------------------------------
datafile %>% 
  as_tibble() # 将数据集转换为tibble格式
## # A tibble: 14 × 6
##       id gender height weight attitude   BMI
##    <dbl>  <dbl>  <dbl>  <dbl>    <dbl> <dbl>
##  1     1      1    180     85        3  26.2
##  2     2      0    165     62        2  22.8
##  3     3      0    168     65        1  23.0
##  4     4      1    175     65        1  21.2
##  5     5      1    165     63        1  23.1
##  6     6      1    168     60        3  21.3
##  7     7      1    175     70        3  22.9
##  8     8      1    170     64        3  22.1
##  9     9      1    165     58        2  21.3
## 10    10      0    163     55        2  20.7
## 11    11      0    158     45        1  18.0
## 12    12      0    160     50        1  19.5
## 13    13      0    166     51        1  18.5
## 14    14      0    160     52        2  20.3

宽型转长型格式

# 方法一
datafile_long <- 
  datafile %>% 
  pivot_longer(
    cols = c(time1, time2), 
    names_to = "time", 
    values_to = "value"
    ) # 宽型转长型
# 方法二
# 示例数据:某实验前后测量的体重数据(单位:kg)
before <- c(70, 68, 75, 80, 72, 78, 74)
after <- c(68, 67, 73, 78, 71, 76, 73)
data <- data.frame(
  subject = rep(1:length(before),2),
  Group = rep(c("Before", "After"), each = length(before)),
  Weight = c(before, after)
)
data
##    subject  Group Weight
## 1        1 Before     70
## 2        2 Before     68
## 3        3 Before     75
## 4        4 Before     80
## 5        5 Before     72
## 6        6 Before     78
## 7        7 Before     74
## 8        1  After     68
## 9        2  After     67
## 10       3  After     73
## 11       4  After     78
## 12       5  After     71
## 13       6  After     76
## 14       7  After     73
# 方法三
set.seed(123)
data <- data.frame(
  Subject = factor(1:10),
  Time1 = rnorm(10, mean = 70, sd = 5),
  Time2 = rnorm(10, mean = 75, sd = 5),
  Time3 = rnorm(10, mean = 80, sd = 5)
)
data
##    Subject    Time1    Time2    Time3
## 1        1 67.19762 81.12041 74.66088
## 2        2 68.84911 76.79907 78.91013
## 3        3 77.79354 77.00386 74.86998
## 4        4 70.35254 75.55341 76.35554
## 5        5 70.64644 72.22079 76.87480
## 6        6 78.57532 83.93457 71.56653
## 7        7 72.30458 77.48925 84.18894
## 8        8 63.67469 65.16691 80.76687
## 9        9 66.56574 78.50678 74.30932
## 10      10 67.77169 72.63604 86.26907
# 将数据转换为长格式
long_data <- gather(data, key = "Time", value = "Score", Time1, Time2, Time3)
long_data
##    Subject  Time    Score
## 1        1 Time1 67.19762
## 2        2 Time1 68.84911
## 3        3 Time1 77.79354
## 4        4 Time1 70.35254
## 5        5 Time1 70.64644
## 6        6 Time1 78.57532
## 7        7 Time1 72.30458
## 8        8 Time1 63.67469
## 9        9 Time1 66.56574
## 10      10 Time1 67.77169
## 11       1 Time2 81.12041
## 12       2 Time2 76.79907
## 13       3 Time2 77.00386
## 14       4 Time2 75.55341
## 15       5 Time2 72.22079
## 16       6 Time2 83.93457
## 17       7 Time2 77.48925
## 18       8 Time2 65.16691
## 19       9 Time2 78.50678
## 20      10 Time2 72.63604
## 21       1 Time3 74.66088
## 22       2 Time3 78.91013
## 23       3 Time3 74.86998
## 24       4 Time3 76.35554
## 25       5 Time3 76.87480
## 26       6 Time3 71.56653
## 27       7 Time3 84.18894
## 28       8 Time3 80.76687
## 29       9 Time3 74.30932
## 30      10 Time3 86.26907

独立样本t检验

# 独立样本student's t test
head(datafile)
## # A tibble: 6 × 6
##      id gender height weight attitude   BMI
##   <dbl>  <dbl>  <dbl>  <dbl>    <dbl> <dbl>
## 1     1      1    180     85        3  26.2
## 2     2      0    165     62        2  22.8
## 3     3      0    168     65        1  23.0
## 4     4      1    175     65        1  21.2
## 5     5      1    165     63        1  23.1
## 6     6      1    168     60        3  21.3
t.result <- 
  t.test(
    height ~ gender, 
    data = datafile, 
    var.equal = TRUE
    ) 
t.result
## 
##  Two Sample t-test
## 
## data:  height by gender
## t = -3.2339, df = 12, p-value = 0.007167
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -13.868165  -2.703263
## sample estimates:
## mean in group 0 mean in group 1 
##        162.8571        171.1429
str(t.result)
## List of 10
##  $ statistic  : Named num -3.23
##   ..- attr(*, "names")= chr "t"
##  $ parameter  : Named num 12
##   ..- attr(*, "names")= chr "df"
##  $ p.value    : num 0.00717
##  $ conf.int   : num [1:2] -13.9 -2.7
##   ..- attr(*, "conf.level")= num 0.95
##  $ estimate   : Named num [1:2] 163 171
##   ..- attr(*, "names")= chr [1:2] "mean in group 0" "mean in group 1"
##  $ null.value : Named num 0
##   ..- attr(*, "names")= chr "difference in means between group 0 and group 1"
##  $ stderr     : num 2.56
##  $ alternative: chr "two.sided"
##  $ method     : chr " Two Sample t-test"
##  $ data.name  : chr "height by gender"
##  - attr(*, "class")= chr "htest"
t.result$method
## [1] " Two Sample t-test"
t.result$alternative
## [1] "two.sided"
t.result$null.value
## difference in means between group 0 and group 1 
##                                               0
t.result$estimate
## mean in group 0 mean in group 1 
##        162.8571        171.1429
t.result$statistic
##         t 
## -3.233888
t.result$parameter
## df 
## 12
t.result$p.value
## [1] 0.00716743
t.result$stderr
## [1] 2.562153
t.result$conf.int[1]
## [1] -13.86817
t.result$conf.int[2]
## [1] -2.703263
t.result$conf.int
## [1] -13.868165  -2.703263
## attr(,"conf.level")
## [1] 0.95
# 独立样本Welch's t检验
t.test(height ~ gender, data = datafile) 
## 
##  Welch Two Sample t-test
## 
## data:  height by gender
## t = -3.2339, df = 10.248, p-value = 0.00869
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -13.975874  -2.595555
## sample estimates:
## mean in group 0 mean in group 1 
##        162.8571        171.1429
# Welch's t-test的具体计算
result <- datafile %>%
  group_by(gender) %>%
  summarise(
    n = n(),
    mean = mean(height, na.rm = TRUE),
    sd = sd(height, na.rm = TRUE)
    ) # 按性别对身高进行描述性统计
result
## # A tibble: 2 × 4
##   gender     n  mean    sd
##    <dbl> <int> <dbl> <dbl>
## 1      0     7  163.  3.67
## 2      1     7  171.  5.70
mean(datafile$height[datafile$gender == 0])
## [1] 162.8571
mean(datafile$height[datafile$gender == 1])
## [1] 171.1429
t.value <- (result[1,3] - result[2,3])/sqrt(result[1,4]^2/result[1,2] + result[2,4]^2/result[2,2]) # 计算t值
# 计算Welch-Satterthwaite自由度
df.value <- (result[1,4]^2/result[1,2] + result[2,4]^2/result[2,2])^2/((result[1,4]^2/result[1,2])^2/(result[1,2]-1) + (result[2,4]^2/result[2,2])^2/(result[2,2]-1)) # 计算自由度
# 计算p值
t.value <- as.numeric(t.value)
df.value <- as.numeric(df.value)
p.value <- 2*pt(-abs(t.value), df.value)
t.value
## [1] -3.233888
df.value
## [1] 10.24801
p.value
## [1] 0.008689748

独立样本 t 检验简介

独立样本 t 检验 用于比较两个独立组的均值是否存在显著差异。这种检验适用于一个定量变量和一个二分类分组变量。


应用场景

  1. 比较两个实验组和对照组的效果。
  2. 比较男性和女性在某一变量(如收入、身高)上的差异。
  3. 在两组样本数据之间,判断是否有显著性差异。

假设

  • 原假设 \(H_0\):两组均值相等(\(\mu_1 = \mu_2\))。
  • 备择假设 \(H_1\)
    • 双尾检验:两组均值不相等(\(\mu_1 \neq \mu_2\))。
    • 单尾检验:一组均值大于或小于另一组(\(\mu_1 > \mu_2\)\(\mu_1 < \mu_2\))。

检验方法

根据两组的方差是否相等,分为两种情况: 1. 方差相等:使用标准 t 检验。 2. 方差不相等:使用 Welch t 检验(更稳健)。

独立样本 t 检验的公式

检验统计量公式

  1. 当方差相等时\[ t = \frac{\bar{X}_1 - \bar{X}_2}{\sqrt{s_p^2 \left(\frac{1}{n_1} + \frac{1}{n_2}\right)}} \] 其中:
  • \(\bar{X}_1, \bar{X}_2\):两组样本均值。
  • \(n_1, n_2\):两组样本大小。
  • \(s_p^2\):两组的合并方差,计算公式为: \[ s_p^2 = \frac{(n_1 - 1)s_1^2 + (n_2 - 1)s_2^2}{n_1 + n_2 - 2} \]
  1. 当方差不相等(Welch t 检验)\[ t = \frac{\bar{X}_1 - \bar{X}_2}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}} \]

  2. 自由度: 对于 Welch t 检验,自由度根据以下公式计算: \[ df = \frac{\left(\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}\right)^2}{\frac{\left(\frac{s_1^2}{n_1}\right)^2}{n_1 - 1} + \frac{\left(\frac{s_2^2}{n_2}\right)^2}{n_2 - 1}} \]

在 R 中执行独立样本 t 检验

1. 示例数据

# 示例数据
data <- data.frame(
  score = c(85, 90, 78, 92, 88, 76, 95, 89, 80, 84),
  group = c("A", "A", "B", "B", "A", "B", "A", "A", "B", "B")
)
# 执行独立样本 t 检验
t_test_result <- t.test(score ~ group, data = data)

# 查看结果
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  score by group
## t = 2.2665, df = 6.3952, p-value = 0.06128
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -0.4709231 15.2709231
## sample estimates:
## mean in group A mean in group B 
##            89.4            82.0

结果解释、注意事项与总结

结果解释

  1. t 值:表示两组均值差异的标准化大小,t 的绝对值越大,差异越显著。
  2. p 值:用于判断均值是否存在显著差异:
    • p > 0.05:无法拒绝原假设,两组均值无显著差异。
    • p ≤ 0.05:拒绝原假设,认为两组均值有显著差异。
  3. 自由度:影响 t 值和 p 值的计算,自由度越大,检验效能越高。
  4. 置信区间:提供均值差异的范围。如果包含 0,则均值无显著差异。
  5. 方差是否相等:根据 var.test() 的结果判断方差是否齐性,影响 t 检验的计算方式。

注意事项

  1. 方差齐性假设
    • 使用 var.test() 检验两组方差是否相等:
    var.test(score ~ group, data = data)
    • 如果方差相等,可在 t 检验中设置 var.equal = TRUE
  2. 单尾检验
    • 如果假设组 A 均值大于组 B 均值,可设置 alternative = "greater"
    • 如果假设组 A 均值小于组 B 均值,可设置 alternative = "less"
  3. 正态性检验
    • 小样本时(每组 < 30),需使用 shapiro.test() 检验正态性:
    shapiro.test(data$score[data$group == "A"])
    shapiro.test(data$score[data$group == "B"])
    • 如果数据不满足正态性假设,建议使用非参数检验(如 wilcox.test())。
  4. 样本量影响
    • 样本量过小时,p 值可能不够稳定;建议结合效应量分析。

总结

独立样本 t 检验是比较两组均值差异的重要工具。在 R 中,使用 t.test() 可快速实现: - 若方差齐性假设成立,可使用标准 t 检验。 - 若方差不齐性成立,默认使用 Welch t 检验。

在分析中,应根据数据的正态性、样本量和方差情况选择合适的检验方法。如不满足正态性假设,可用非参数检验替代。

两总体方差齐性检验

判断两组数据的方差是否相等,通常使用 F检验(F-test)或其他稳健的检验方法(Bartlett’s test、Levene’s test)。此外还可以使用箱线图或残差图、样本方差比值来判断方差是否大致相等。

# F test
var.test(height ~ gender, data = datafile)
## 
##  F test to compare two variances
## 
## data:  height by gender
## F = 0.41496, num df = 6, denom df = 6, p-value = 0.3086
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.07130127 2.41494298
## sample estimates:
## ratio of variances 
##           0.414956
# Levene's test
leveneTest(
  height ~ factor(gender),
  data = datafile
)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  1.3698 0.2646
##       12
# Bartlett's test
bartlett.test(height ~ factor(gender),
              data = datafile)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  height by factor(gender)
## Bartlett's K-squared = 1.0384, df = 1, p-value = 0.3082

F-test

F 检验用于比较两组数据的方差,适用于正态分布数据。

语法:

var.test(x, y, alternative = "two.sided", conf.level = 0.95)

其中:

  • x, y:要比较的两组数据。
  • alternative: “two.sided”:双侧检验(默认),检测方差是否相等。 “less”:检验 x 的方差是否小于 y。 “greater”:检验 x 的方差是否大于 y。
  • conf.level:置信水平,默认 95%。

Bartlett’s Test

Bartlett’s Test 的原理

Bartlett’s Test 的原假设和备择假设如下:

  • 原假设 \(H_0\):所有组的方差相等。
  • 备择假设 \(H_1\):至少有一组的方差不同。

公式

检验统计量 \(T\) 的公式为:

\[ T = \frac{(N - k) \ln(S^2) - \sum_{i=1}^{k} (n_i - 1) \ln(S_i^2)}{1 + \frac{1}{3(k-1)} \left(\sum_{i=1}^{k} \frac{1}{n_i - 1} - \frac{1}{N - k}\right)} \]

其中:

  • \(N\):总样本数。
  • \(k\):组数。
  • \(S^2\):整体样本的加权方差,计算公式为:

\[ S^2 = \frac{\sum_{i=1}^{k} (n_i - 1) S_i^2}{N - k} \]

  • \(n_i\):第 \(i\) 组的样本数。
  • \(S_i^2\):第 \(i\) 组的样本方差。

Bartlett’s Test 的检验统计量 \(T\) 近似服从 \(\chi^2\) 分布,自由度为 \(k - 1\)

Bartlett’s Test 的适用范围

  • Bartlett’s Test 适用于正态分布的数据。
  • 对非正态数据,Bartlett’s Test 对偏态较敏感,可能失去检验效能。

如果数据非正态,建议使用更稳健的方法,例如 Levene’s Test。

在 R 中使用 Bartlett’s Test

1. 函数:bartlett.test()

R 提供了内置的 bartlett.test() 函数,用于执行 Bartlett’s Test。

2. 示例数据

# 示例数据
data <- data.frame(
  height = c(160, 165, 170, 155, 150, 145, 175, 180, 165, 160, 150, 155),
  group = c("A", "A", "A", "B", "B", "B", "C", "C", "C", "D", "D", "D")
)

3. 运行Bartlett’s Test

# 执行 Bartlett's Test
result <- bartlett.test(height ~ group, data = data)

# 查看结果
print(result)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  height by group
## Bartlett's K-squared = 0.50223, df = 3, p-value = 0.9184
str(result)
## List of 5
##  $ statistic: Named num 0.502
##   ..- attr(*, "names")= chr "Bartlett's K-squared"
##  $ parameter: Named num 3
##   ..- attr(*, "names")= chr "df"
##  $ p.value  : num 0.918
##  $ data.name: chr "height by group"
##  $ method   : chr "Bartlett test of homogeneity of variances"
##  - attr(*, "class")= chr "htest"

4. 输出解释

运行 bartlett.test() 后,输出结果类似如下:

  1. 检验统计量 \(0.50223\),表明组间方差的差异程度。
  2. 自由度 \(df = 3\),表示数据中有 4 个组(\(4 - 1 = 3\))。
  3. p 值为 0.9184,大于 0.05,因此不能拒绝原假设,认为组间方差相等。

这意味着数据满足方差齐性的假设,可以继续执行单因素方差分析(ANOVA)。


Bartlett’s Test 的应用场景

  1. 方差齐性检验: 在执行单因素方差分析(ANOVA)之前,验证分组数据是否满足方差齐性假设。

  2. 组间方差比较: 检查多个实验组的方差是否显著不同。

Bartlett’s Test 的优缺点

优点

  • 非常适合正态分布的数据。
  • 是方差齐性检验的一种经典方法。

缺点

  • 对正态性假设敏感。如果数据分布偏离正态,结果可能不准确。
  • 对极端值和偏态数据较为敏感。

与 Levene’s Test 的对比

特点 Bartlett’s Test Levene’s Test
正态性假设 需要满足正态性 不需要严格满足正态性
稳健性 对非正态数据敏感 对非正态数据更稳健
用途 用于正态数据的方差齐性检验 用于正态和非正态数据的方差齐性检验
极端值的影响 容易受到极端值影响 对极端值更稳健

代码完整示例

以下是完整的代码示例:

# 示例数据
data <- data.frame(
  height = c(160, 165, 170, 155, 150, 145, 175, 180, 165, 160, 150, 155),
  group = c("A", "A", "A", "B", "B", "B", "C", "C", "C", "D", "D", "D")
)

# 执行 Bartlett's Test
bartlett_result <- bartlett.test(height ~ group, data = data)

# 打印结果
print(bartlett_result)

# 检查数据正态性(以每组为单位)
by(data$height, data$group, shapiro.test)

Levene’s Test

Levene’s Test 的原理

Levene’s Test 的假设如下:

  • 原假设 \(H_0\):所有组的方差相等。
  • 备择假设 \(H_1\):至少有一组的方差与其他组不同。

Levene’s Test 的基本过程

  1. 计算每组数据的偏差 \(Z_{ij}\)
  2. 计算组内偏差的均值 \(Z_{i.}\) 和总体偏差均值 \(Z_{..}\)
  3. 根据公式计算检验统计量 \(W\)
  4. 比较 \(W\)\(F\) 分布的临界值,或通过 p 值判断是否拒绝原假设。 #### 检验统计量公式

Levene’s Test 的检验统计量 \(W\) 定义如下:

\[ W = \frac{(N - k)}{(k - 1)} \cdot \frac{\sum_{i=1}^k n_i (Z_{i.} - Z_{..})^2}{\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{i.})^2} \]

其中:

  • \(N\):总样本数。
  • \(k\):组数。
  • \(n_i\):第 \(i\) 组的样本大小。
  • \(Z_{ij}\):第 \(i\) 组中第 \(j\) 个样本与其组中心(均值或中位数)的绝对偏差。 \[ Z_{ij} = |Y_{ij} - \text{center}_i| \]
    • 如果 center 选用均值:\(\text{center}_i = \bar{Y}_i\)
    • 如果 center 选用中位数:\(\text{center}_i = \text{median}(Y_i)\)
  • \(Z_{i.}\):第 \(i\) 组的 \(Z_{ij}\) 的均值。
  • \(Z_{..}\):所有 \(Z_{ij}\) 的总体均值。

公式解释

  • 分子表示组间偏差平方和,衡量组间差异。
  • 分母表示组内偏差平方和,衡量组内差异。
  • \(W\) 服从 \(F\) 分布,自由度为 \((k-1, N-k)\)

Levene’s Test 的适用范围

  • 数据可以是正态分布或非正态分布。
  • 对于存在异常值的数据,Levene’s Test 更稳健。
  • 分组变量应为因子型,响应变量应为定量型。

在 R 中使用 Levene’s Test

1. 函数:leveneTest()

leveneTest() 是 R 中 car 包提供的函数,用于执行 Levene’s Test。

2. 示例数据

# 示例数据
data <- data.frame(
  height = c(160, 165, 170, 155, 150, 145, 175, 180, 165, 160, 150, 155),
  group = c("A", "A", "A", "B", "B", "B", "C", "C", "C", "D", "D", "D")
)
# 如果尚未安装 car 包
# install.packages("car")

# 加载包
# library(car)
# 执行 Levene’s Test
result <- leveneTest(height ~ as.factor(group), data = data)

# 查看结果
print(result)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  0.1667 0.9159
##        8
str(result)
## Classes 'anova' and 'data.frame':    2 obs. of  3 variables:
##  $ Df     : int  3 8
##  $ F value: num  0.167 NA
##  $ Pr(>F) : num  0.916 NA
##  - attr(*, "heading")= chr "Levene's Test for Homogeneity of Variance (center = median)"

3. 结果解释

  1. F value:检验统计量,表示组间偏差的差异程度。
    • 在此结果中,\(F = 0.1667\)
  2. Df:自由度,包括组间自由度(\(k - 1\))和组内自由度(\(N - k\))。
    • 自由度 \(Df = 3\) 表示有 4 个组(组间自由度为 \(4 - 1 = 3\))。
  3. Pr(>F):p 值,用于判断是否拒绝原假设。
  • 如果 p 值 > 0.05,则不能拒绝原假设,认为组间方差相等。
  • 如果 p 值 ≤ 0.05,则拒绝原假设,认为至少有一组的方差与其他组不同。
  • 在此结果中,\(p = 0.9159\),大于 0.05,不能拒绝原假设,说明组间方差没有显著差异。

注意

  • 中心选择的影响:在默认情况下,Levene’s Test 使用中位数作为中心。如果数据接近正态分布,选择均值可能更合适。
  • 结果解读需结合具体数据背景,尤其是在边界 p 值(如接近 0.05)时。

Levene’s Test 的中心选择

leveneTest() 默认使用中位数作为中心计算偏差(center = "median"),适合非正态分布数据。

如果数据接近正态分布,可以选择均值作为中心:

leveneTest(height ~ group, data = data, center = "mean")

Levene’s Test 的应用场景

  1. 单因素方差分析(ANOVA): 在 ANOVA 中,需要验证数据是否满足方差齐性假设。

  2. 处理方差不齐的情况: 如果方差不齐,可以使用更稳健的方法(如 Welch 方差分析):

   oneway.test(height ~ group, data = data)

Levene’s Test 的优缺点

优点

  • 对非正态分布数据稳健。
  • 比 Bartlett’s Test 更适合包含异常值的数据。

缺点

  • 在样本量特别小时,可能检验效能较低。
  • 偏态数据仍可能影响结果。

代码完整示例

# 示例数据
data <- data.frame(
  height = c(160, 165, 170, 155, 150, 145, 175, 180, 165, 160, 150, 155),
  group = c("A", "A", "A", "B", "B", "B", "C", "C", "C", "D", "D", "D")
)

# 安装并加载 car 包
install.packages("car")
library(car)

# 执行 Levene’s Test
levene_result <- leveneTest(height ~ group, data = data)

# 打印结果
print(levene_result)

# 以均值为中心计算
leveneTest(height ~ group, data = data, center = "mean")

可视化检查方差差异

绘制箱线图、残差图、标准差条形图等来判断方差是否大致相等.

箱线图1

boxplot(height ~ group, data, main = "Boxplot of Groups", col = c("lightblue", "lightgreen","lightpink"))

箱线图2

library(ggplot2)
ggplot(data, aes(x = group, y = height)) +
  geom_boxplot() +
  stat_summary(fun = "var", geom = "point", shape = 23, size = 4, fill = "red") +
  labs(title = "Variance Comparison")

标准差条形图

# 计算每组均值和标准差
summary_df <- data %>%
  group_by(group) %>%
  summarise(mean_value = mean(height),
            sd_value = sd(height))

print(summary_df)
## # A tibble: 4 × 3
##   group mean_value sd_value
##   <chr>      <dbl>    <dbl>
## 1 A           165      5   
## 2 B           150      5   
## 3 C           173.     7.64
## 4 D           155      5
ggplot(summary_df, aes(x = group, y = mean_value)) +
  geom_bar(stat = "identity", fill = "lightblue", color = "black") +  # 条形图
  geom_errorbar(aes(ymin = mean_value - sd_value, ymax = mean_value + sd_value),
                width = 0.2, color = "red") +  # 标准差误差条
  labs(title = "Group Mean with Standard Deviation",
       x = "Group", y = "Mean Value") +
  theme_minimal()

直接比较样本方差

直接计算样本方差并比较大小,判断方差是否大致相等。

单因素方差分析

# 单因素方差分析
 aov(height ~ factor(attitude), 
      data = datafile)
## Call:
##    aov(formula = height ~ factor(attitude), data = datafile)
## 
## Terms:
##                 factor(attitude) Residuals
## Sum of Squares          229.1667  286.8333
## Deg. of Freedom                2        11
## 
## Residual standard error: 5.106443
## Estimated effects may be unbalanced
aov.result <- 
  aov(height ~ factor(attitude), 
      data = datafile)
summary(aov.result)
##                  Df Sum Sq Mean Sq F value Pr(>F)  
## factor(attitude)  2  229.2  114.58   4.394 0.0396 *
## Residuals        11  286.8   26.08                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
str(aov.result)
## List of 13
##  $ coefficients : Named num [1:3] 165.33 -2.08 7.92
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "factor(attitude)2" "factor(attitude)3"
##  $ residuals    : Named num [1:14] 6.75 1.75 2.667 9.667 -0.333 ...
##   ..- attr(*, "names")= chr [1:14] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:14] -624.86 8.87 12.26 7.94 -2.06 ...
##   ..- attr(*, "names")= chr [1:14] "(Intercept)" "factor(attitude)2" "factor(attitude)3" "" ...
##  $ rank         : int 3
##  $ fitted.values: Named num [1:14] 173 163 165 165 165 ...
##   ..- attr(*, "names")= chr [1:14] "1" "2" "3" "4" ...
##  $ assign       : int [1:3] 0 1 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:14, 1:3] -3.742 0.267 0.267 0.267 0.267 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:14] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:3] "(Intercept)" "factor(attitude)2" "factor(attitude)3"
##   .. ..- attr(*, "assign")= int [1:3] 0 1 1
##   .. ..- attr(*, "contrasts")=List of 1
##   .. .. ..$ factor(attitude): chr "contr.treatment"
##   ..$ qraux: num [1:3] 1.27 1.46 1.35
##   ..$ pivot: int [1:3] 1 2 3
##   ..$ tol  : num 1e-07
##   ..$ rank : int 3
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 11
##  $ contrasts    :List of 1
##   ..$ factor(attitude): chr "contr.treatment"
##  $ xlevels      :List of 1
##   ..$ factor(attitude): chr [1:3] "1" "2" "3"
##  $ call         : language aov(formula = height ~ factor(attitude), data = datafile)
##  $ terms        :Classes 'terms', 'formula'  language height ~ factor(attitude)
##   .. ..- attr(*, "variables")= language list(height, factor(attitude))
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "height" "factor(attitude)"
##   .. .. .. ..$ : chr "factor(attitude)"
##   .. ..- attr(*, "term.labels")= chr "factor(attitude)"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(height, factor(attitude))
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
##   .. .. ..- attr(*, "names")= chr [1:2] "height" "factor(attitude)"
##  $ model        :'data.frame':   14 obs. of  2 variables:
##   ..$ height          : num [1:14] 180 165 168 175 165 168 175 170 165 163 ...
##   ..$ factor(attitude): Factor w/ 3 levels "1","2","3": 3 2 1 1 1 3 3 3 2 2 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language height ~ factor(attitude)
##   .. .. ..- attr(*, "variables")= language list(height, factor(attitude))
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "height" "factor(attitude)"
##   .. .. .. .. ..$ : chr "factor(attitude)"
##   .. .. ..- attr(*, "term.labels")= chr "factor(attitude)"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(height, factor(attitude))
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "height" "factor(attitude)"
##  - attr(*, "class")= chr [1:2] "aov" "lm"
# 如果p<0.05,进行事后检验,多重比较
tukey.result <- 
  TukeyHSD(aov.result)
tukey.result
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = height ~ factor(attitude), data = datafile)
## 
## $`factor(attitude)`
##          diff         lwr       upr     p adj
## 2-1 -2.083333 -10.9858829  6.819216 0.8059400
## 3-1  7.916667  -0.9858829 16.819216 0.0827135
## 3-2 10.000000   0.2477456 19.752254 0.0444816
# 可视化结果
plot(tukey.result)

boxplot(height ~ attitude, data = datafile,
        main = "Height by Attitude",
        xlab = "Attitude",
        ylab = "Height",
        col = c("skyblue", "pink", "lightgreen"))

回归分析

datafile[1:5,] # 查看数据集的前5行
## # A tibble: 5 × 6
##      id gender height weight attitude   BMI
##   <dbl>  <dbl>  <dbl>  <dbl>    <dbl> <dbl>
## 1     1      1    180     85        3  26.2
## 2     2      0    165     62        2  22.8
## 3     3      0    168     65        1  23.0
## 4     4      1    175     65        1  21.2
## 5     5      1    165     63        1  23.1
#数据分布
hist(datafile$weight, 
     main = "Weight Distribution", 
     xlab = "Weight", 
     ylab = "Frequency", 
     col = "skyblue")

hist(datafile$height, 
     main = "Weight Distribution", 
     xlab = "Weight", 
     ylab = "Frequency", 
     col = "skyblue")

# 描述统计量
N  <-  sapply(datafile, length)
Mean  <-  sapply(datafile, mean)
SD  <-  sapply(datafile, sd)
Min <- sapply(datafile, min)
Med <- sapply(datafile, median)
Max <- sapply(datafile, max)
result <- cbind(N, Mean, SD, Min, Med, Max)
result
##           N       Mean         SD       Min       Med       Max
## id       14   7.500000  4.1833001   1.00000   7.50000  14.00000
## gender   14   0.500000  0.5188745   0.00000   0.50000   1.00000
## height   14 167.000000  6.3001831 158.00000 165.50000 180.00000
## weight   14  60.357143 10.0046692  45.00000  61.00000  85.00000
## attitude 14   1.857143  0.8644378   1.00000   2.00000   3.00000
## BMI      14  21.503286  2.1203676  18.02596  21.28123  26.23457
# 箱式图
datafile$cat <- as.factor((datafile$weight > median(datafile$weight))*1)
head(datafile)
## # A tibble: 6 × 7
##      id gender height weight attitude   BMI cat  
##   <dbl>  <dbl>  <dbl>  <dbl>    <dbl> <dbl> <fct>
## 1     1      1    180     85        3  26.2 1    
## 2     2      0    165     62        2  22.8 1    
## 3     3      0    168     65        1  23.0 1    
## 4     4      1    175     65        1  21.2 1    
## 5     5      1    165     63        1  23.1 1    
## 6     6      1    168     60        3  21.3 0
levels(datafile$cat) <- c("light", "heavy")
boxplot(height ~ cat, data = datafile, 
        main = "Height by Weight Category",
        xlab = "Weight Category",
        ylab = "Height",
        col = c("skyblue", "pink"))

# 全模型
model <- lm(weight ~ height, data = datafile)
summary(model)
## 
## Call:
## lm(formula = weight ~ height, data = datafile)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9114 -1.8930  0.0944  2.8386  5.8483 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -181.0808    31.6915  -5.714 9.70e-05 ***
## height         1.4457     0.1896   7.623 6.14e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.308 on 12 degrees of freedom
## Multiple R-squared:  0.8289, Adjusted R-squared:  0.8146 
## F-statistic: 58.12 on 1 and 12 DF,  p-value: 6.138e-06
# 模拟1000抽样80%的数据
nsimu <- 1000
ss <- nrow(datafile)
ss0 <- round(ss*0.8)
R2 <- rep(0,nsimu)

for (i in 1:nsimu) {
  index <- sample(1:ss, ss0)
  model <- lm(weight ~ height, data = datafile[index,])
  y.hat <- predict(model, datafile[-index,3])
  y.true <- datafile$weight[-index]
  R2[i] <- 1 - sum((y.true - y.hat)^2)/sum((y.true - mean(y.true))^2)
  R2[i] <- R2[i] * 100
}
head(R2)
## [1]  67.01724  77.41251  95.68766  58.71462  77.13235 -30.27099
boxplot(R2, main = "R2 Distribution", col = "skyblue")

# 计算VIF
model <- lm(weight ~ height + gender, data = datafile)
vif(model)
##   height   gender 
## 1.871503 1.871503
# 计算Cook距离, 图示
plot(model, which = 4)

# 残差分析,图示
plot(model, which = 1:6)

# 模型的AIC,BIC
# 用于评估模型的优劣,特别是在平衡模型的拟合程度和复杂度时
# AIC()与step()中的AIC()结果不一样,相差一个常数nln(2pi)+n+2
model.aic <- step(model)
## Start:  AIC=44.72
## weight ~ height + gender
## 
##          Df Sum of Sq    RSS    AIC
## - gender  1      0.18 222.69 42.734
## <none>                222.52 44.723
## - height  1    562.62 785.14 60.375
## 
## Step:  AIC=42.73
## weight ~ height
## 
##          Df Sum of Sq     RSS    AIC
## <none>                 222.69 42.734
## - height  1    1078.5 1301.21 65.448
model.bic <- step(model, k = log(ss))
## Start:  AIC=46.64
## weight ~ height + gender
## 
##          Df Sum of Sq    RSS    AIC
## - gender  1      0.18 222.69 44.013
## <none>                222.52 46.641
## - height  1    562.62 785.14 61.653
## 
## Step:  AIC=44.01
## weight ~ height
## 
##          Df Sum of Sq     RSS    AIC
## <none>                 222.69 44.013
## - height  1    1078.5 1301.21 66.087
summary(model.aic)
## 
## Call:
## lm(formula = weight ~ height, data = datafile)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9114 -1.8930  0.0944  2.8386  5.8483 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -181.0808    31.6915  -5.714 9.70e-05 ***
## height         1.4457     0.1896   7.623 6.14e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.308 on 12 degrees of freedom
## Multiple R-squared:  0.8289, Adjusted R-squared:  0.8146 
## F-statistic: 58.12 on 1 and 12 DF,  p-value: 6.138e-06
model.aic$anova[1,"AIC"]
## [1] 44.72337
AIC(model)
## [1] 86.45365
delta.aic <- AIC(model) - model.aic$anova[1,"AIC"]
delta.aic
## [1] 41.73028
N[1]*log(2*pi) + N[1] + 2
##       id 
## 41.73028
# 人群细分
datafile$height_new <- as.factor((datafile$height > median(datafile$height))*1)
datafile$gender_new <- as.factor((datafile$gender > median(datafile$gender))*1)
table(datafile$height_new, datafile$gender_new)/nrow(datafile)
##    
##             0         1
##   0 0.3571429 0.1428571
##   1 0.1428571 0.3571429
table(datafile$gender_new)
## 
## 0 1 
## 7 7

双因素方差分析

# 读入数据house.csv
file.exists("house.csv")
## [1] TRUE
house <- read.csv("house.csv", fileEncoding = "GBK")
head(house)
##   district bedrooms halls      area  floor subway school    price
## 1     丰台        2     2 109.38765    low      0      0 4.920575
## 2   石景山        2     1  55.01044   high      1      0 3.369027
## 3   石景山        2     1  79.69252   high      1      0 3.769788
## 4     西城        2     1  52.47666   high      1      1 8.319766
## 5     东城        1     1  50.81697    low      1      1 8.077537
## 6     西城        2     0  67.15277 middle      1      1 8.131468
house[1:5,]
##   district bedrooms halls      area floor subway school    price
## 1     丰台        2     2 109.38765   low      0      0 4.920575
## 2   石景山        2     1  55.01044  high      1      0 3.369027
## 3   石景山        2     1  79.69252  high      1      0 3.769788
## 4     西城        2     1  52.47666  high      1      1 8.319766
## 5     东城        1     1  50.81697   low      1      1 8.077537
# 因变量原始及对数变换后的直方图
par(mfrow = c(1, 2))
hist(house$price, 
     main = "Price Distribution", 
     xlab = "Price", 
     ylab = "Frequency",
     col = "skyblue")
hist(log(house$price), 
     main = "Log Price Distribution", 
     xlab = "Log Price", 
     ylab = "Frequency",
     col = "skyblue")

# 连续性自变量的直方图
par(mfrow = c(1, 1))
hist(house$area, 
     main = "Area Distribution", 
     xlab = "Area", 
     ylab = "Frequency",
     col = "skyblue")

# 按行政区划分对价格的对数进行描述性统计并排序

library(tidyverse)
library(stringi)
house %>%
  group_by(district) %>%
  summarise(
    n = n(),
    mean = mean(log(price), na.rm = TRUE),
    sd = sd(log(price), na.rm = TRUE),
    min = min(log(price), na.rm = TRUE),
    med = median(log(price), na.rm = TRUE),
    max = max(log(price), na.rm = TRUE)
  ) %>%
 mutate(pinyin = stri_trans_general(district, "Latin")) %>% # 转换为拼音
  arrange(pinyin) %>% # 按拼音排序,若降序用desc()
  select(-pinyin) # 去除临时拼音列
## # A tibble: 6 × 7
##   district     n  mean    sd   min   med   max
##   <chr>    <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 朝阳      2624  1.73 0.231 0.996  1.73  2.59
## 2 东城      2600  2.01 0.261 0.897  2.07  2.73
## 3 丰台      2721  1.54 0.195 0.694  1.54  2.25
## 4 海淀      2687  1.97 0.239 1.09   1.98  2.64
## 5 石景山    1806  1.48 0.224 0.744  1.45  2.34
## 6 西城      2562  2.18 0.231 0.879  2.20  2.76
# 建立全模型a
# 将基准类别设置为 "西城"
house$district <- relevel(factor(house$district), ref = "朝阳")
# 另一种方法
# house$district <- factor(house$district, levels = c("朝阳", "西城", "东城", "丰台", "海淀", "石景山"))
model.a <- lm(log(price) ~ district*subway + school + floor + as.factor(bedrooms) + as.factor(halls) + area, data = house)
summary(model.a)
## 
## Call:
## lm(formula = log(price) ~ district * subway + school + floor + 
##     as.factor(bedrooms) + as.factor(halls) + area, data = house)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2292 -0.1327 -0.0015  0.1372  0.8874 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.563e+00  1.581e-02  98.858  < 2e-16 ***
## district东城           1.290e-01  2.222e-02   5.802 6.68e-09 ***
## district丰台          -1.059e-01  1.516e-02  -6.987 2.92e-12 ***
## district海淀           2.082e-01  1.558e-02  13.366  < 2e-16 ***
## district石景山        -1.110e-01  1.517e-02  -7.314 2.73e-13 ***
## district西城           4.453e-01  1.810e-02  24.606  < 2e-16 ***
## subway                 1.506e-01  1.341e-02  11.228  < 2e-16 ***
## school                 1.564e-01  4.316e-03  36.235  < 2e-16 ***
## floorlow               3.151e-02  4.300e-03   7.328 2.45e-13 ***
## floormiddle            2.468e-02  4.197e-03   5.880 4.18e-09 ***
## as.factor(bedrooms)2   9.598e-03  4.940e-03   1.943 0.052028 .  
## as.factor(bedrooms)3   2.712e-02  6.511e-03   4.165 3.13e-05 ***
## as.factor(bedrooms)4   6.054e-02  1.248e-02   4.853 1.23e-06 ***
## as.factor(bedrooms)5   1.127e-01  2.434e-02   4.632 3.65e-06 ***
## as.factor(halls)1      2.930e-02  8.396e-03   3.490 0.000484 ***
## as.factor(halls)2      1.257e-01  9.842e-03  12.771  < 2e-16 ***
## as.factor(halls)3      1.546e-01  2.722e-02   5.681 1.37e-08 ***
## area                  -1.014e-03  7.084e-05 -14.315  < 2e-16 ***
## district东城:subway    1.242e-01  2.306e-02   5.384 7.38e-08 ***
## district丰台:subway   -4.341e-02  1.640e-02  -2.647 0.008138 ** 
## district海淀:subway    8.668e-03  1.683e-02   0.515 0.606458    
## district石景山:subway -1.013e-01  1.691e-02  -5.988 2.18e-09 ***
## district西城:subway   -4.869e-02  1.901e-02  -2.561 0.010440 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.212 on 14977 degrees of freedom
## Multiple R-squared:  0.6097, Adjusted R-squared:  0.6091 
## F-statistic:  1063 on 22 and 14977 DF,  p-value: < 2.2e-16
# 建立对照模型b
model.b <- lm(log(price) ~ district*subway + school + floor + as.factor(bedrooms) + area, data = house)
summary(model.b)
## 
## Call:
## lm(formula = log(price) ~ district * subway + school + floor + 
##     as.factor(bedrooms) + area, data = house)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2359 -0.1361 -0.0025  0.1397  0.9058 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.571e+00  1.444e-02 108.797  < 2e-16 ***
## district东城           1.089e-01  2.246e-02   4.851 1.24e-06 ***
## district丰台          -1.188e-01  1.532e-02  -7.758 9.17e-15 ***
## district海淀           1.975e-01  1.575e-02  12.540  < 2e-16 ***
## district石景山        -1.248e-01  1.533e-02  -8.139 4.28e-16 ***
## district西城           4.236e-01  1.827e-02  23.189  < 2e-16 ***
## subway                 1.404e-01  1.356e-02  10.355  < 2e-16 ***
## school                 1.573e-01  4.361e-03  36.068  < 2e-16 ***
## floorlow               3.140e-02  4.349e-03   7.220 5.43e-13 ***
## floormiddle            2.478e-02  4.244e-03   5.839 5.36e-09 ***
## as.factor(bedrooms)2   1.225e-02  4.876e-03   2.512 0.012005 *  
## as.factor(bedrooms)3   2.905e-02  6.551e-03   4.435 9.28e-06 ***
## as.factor(bedrooms)4   4.921e-02  1.258e-02   3.913 9.17e-05 ***
## as.factor(bedrooms)5   8.488e-02  2.427e-02   3.497 0.000473 ***
## area                  -3.326e-04  6.154e-05  -5.406 6.56e-08 ***
## district东城:subway    1.284e-01  2.333e-02   5.504 3.77e-08 ***
## district丰台:subway   -3.577e-02  1.659e-02  -2.157 0.031059 *  
## district海淀:subway    1.013e-02  1.702e-02   0.595 0.551875    
## district石景山:subway -9.458e-02  1.710e-02  -5.530 3.26e-08 ***
## district西城:subway   -3.991e-02  1.922e-02  -2.077 0.037864 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2144 on 14980 degrees of freedom
## Multiple R-squared:  0.6005, Adjusted R-squared:    0.6 
## F-statistic:  1185 on 19 and 14980 DF,  p-value: < 2.2e-16
# 模型比较a与b
anova(model.b, model.a)
## Analysis of Variance Table
## 
## Model 1: log(price) ~ district * subway + school + floor + as.factor(bedrooms) + 
##     area
## Model 2: log(price) ~ district * subway + school + floor + as.factor(bedrooms) + 
##     as.factor(halls) + area
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1  14980 688.72                                  
## 2  14977 672.85  3    15.866 117.72 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 考察行政区划与有无地铁的交互作用
model.c <- lm(log(price) ~ district + subway + school + floor + as.factor(bedrooms) + as.factor(halls) + area, data = house)
anova(model.c, model.a)
## Analysis of Variance Table
## 
## Model 1: log(price) ~ district + subway + school + floor + as.factor(bedrooms) + 
##     as.factor(halls) + area
## Model 2: log(price) ~ district * subway + school + floor + as.factor(bedrooms) + 
##     as.factor(halls) + area
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1  14982 679.04                                  
## 2  14977 672.85  5      6.19 27.557 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## AIC与BIC模型选择
housenew <- house
ss <- nrow(housenew)
housenew$bedrooms <- floor(6*runif(ss))
head(housenew)
##   district bedrooms halls      area  floor subway school    price
## 1     丰台        0     2 109.38765    low      0      0 4.920575
## 2   石景山        0     1  55.01044   high      1      0 3.369027
## 3   石景山        1     1  79.69252   high      1      0 3.769788
## 4     西城        5     1  52.47666   high      1      1 8.319766
## 5     东城        1     1  50.81697    low      1      1 8.077537
## 6     西城        3     0  67.15277 middle      1      1 8.131468
model.s <- lm(log(price) ~ district*subway + school + floor + as.factor(bedrooms) + as.factor(halls) + area, data = housenew)
summary(model.s)
## 
## Call:
## lm(formula = log(price) ~ district * subway + school + floor + 
##     as.factor(bedrooms) + as.factor(halls) + area, data = housenew)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.22834 -0.13350 -0.00118  0.13832  0.87451 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.556e+00  1.617e-02  96.222  < 2e-16 ***
## district东城           1.287e-01  2.225e-02   5.785 7.40e-09 ***
## district丰台          -1.061e-01  1.518e-02  -6.994 2.78e-12 ***
## district海淀           2.091e-01  1.559e-02  13.410  < 2e-16 ***
## district石景山        -1.101e-01  1.519e-02  -7.244 4.58e-13 ***
## district西城           4.460e-01  1.811e-02  24.620  < 2e-16 ***
## subway                 1.495e-01  1.343e-02  11.133  < 2e-16 ***
## school                 1.577e-01  4.313e-03  36.568  < 2e-16 ***
## floorlow               2.978e-02  4.296e-03   6.933 4.30e-12 ***
## floormiddle            2.283e-02  4.190e-03   5.449 5.15e-08 ***
## as.factor(bedrooms)1  -6.244e-03  5.962e-03  -1.047 0.294948    
## as.factor(bedrooms)2  -3.090e-03  5.980e-03  -0.517 0.605359    
## as.factor(bedrooms)3  -3.404e-03  6.039e-03  -0.564 0.572948    
## as.factor(bedrooms)4   3.996e-03  5.956e-03   0.671 0.502298    
## as.factor(bedrooms)5  -1.897e-03  6.028e-03  -0.315 0.753036    
## as.factor(halls)1      2.889e-02  8.175e-03   3.534 0.000411 ***
## as.factor(halls)2      1.232e-01  9.630e-03  12.790  < 2e-16 ***
## as.factor(halls)3      1.683e-01  2.697e-02   6.239 4.51e-10 ***
## area                  -7.394e-04  5.313e-05 -13.916  < 2e-16 ***
## district东城:subway    1.244e-01  2.309e-02   5.389 7.19e-08 ***
## district丰台:subway   -4.169e-02  1.642e-02  -2.539 0.011138 *  
## district海淀:subway    1.078e-02  1.684e-02   0.640 0.522182    
## district石景山:subway -9.935e-02  1.693e-02  -5.868 4.52e-09 ***
## district西城:subway   -4.609e-02  1.903e-02  -2.422 0.015434 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2122 on 14976 degrees of freedom
## Multiple R-squared:  0.6088, Adjusted R-squared:  0.6082 
## F-statistic:  1013 on 23 and 14976 DF,  p-value: < 2.2e-16
model.aic <- step(model.s, trace = F)
model.bic <- step(model.s, k = log(ss), trace = F)
summary(model.aic)
## 
## Call:
## lm(formula = log(price) ~ district + subway + school + floor + 
##     as.factor(halls) + area + district:subway, data = housenew)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.22977 -0.13372 -0.00062  0.13812  0.87294 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.554e+00  1.569e-02  99.074  < 2e-16 ***
## district东城           1.286e-01  2.225e-02   5.780 7.60e-09 ***
## district丰台          -1.064e-01  1.517e-02  -7.013 2.44e-12 ***
## district海淀           2.090e-01  1.559e-02  13.404  < 2e-16 ***
## district石景山        -1.103e-01  1.519e-02  -7.263 3.96e-13 ***
## district西城           4.459e-01  1.811e-02  24.624  < 2e-16 ***
## subway                 1.493e-01  1.343e-02  11.118  < 2e-16 ***
## school                 1.578e-01  4.313e-03  36.584  < 2e-16 ***
## floorlow               2.979e-02  4.295e-03   6.937 4.18e-12 ***
## floormiddle            2.287e-02  4.189e-03   5.460 4.84e-08 ***
## as.factor(halls)1      2.875e-02  8.171e-03   3.518 0.000436 ***
## as.factor(halls)2      1.230e-01  9.628e-03  12.778  < 2e-16 ***
## as.factor(halls)3      1.683e-01  2.697e-02   6.241 4.46e-10 ***
## area                  -7.393e-04  5.312e-05 -13.918  < 2e-16 ***
## district东城:subway    1.246e-01  2.308e-02   5.397 6.87e-08 ***
## district丰台:subway   -4.132e-02  1.642e-02  -2.517 0.011847 *  
## district海淀:subway    1.084e-02  1.684e-02   0.644 0.519578    
## district石景山:subway -9.905e-02  1.693e-02  -5.852 4.97e-09 ***
## district西城:subway   -4.605e-02  1.902e-02  -2.421 0.015484 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2122 on 14981 degrees of freedom
## Multiple R-squared:  0.6087, Adjusted R-squared:  0.6082 
## F-statistic:  1295 on 18 and 14981 DF,  p-value: < 2.2e-16

logistic回归

# 读入数据
file.exists("logitdata.csv")
## [1] TRUE
logitdata <- read.csv("logitdata.csv")
head(logitdata)
##          ARA    ASSET    ATO         ROA     GROWTH       LEV SHARE ST
## 1 0.19230963 19.85605 0.0052 0.087709802 -0.9507273 0.4458801 26.89  0
## 2 0.22011996 20.91086 0.0056 0.016820383 -0.9426563 0.3986864 39.62  0
## 3 0.32529169 19.35262 0.0166 0.042468332 -0.9374404 0.3033481 26.46  0
## 4 0.02572868 21.43893 0.0028 0.018151630 -0.8529953 0.7582502 60.16  0
## 5 0.53359089 21.61334 0.2552 0.004146607 -0.8167039 0.7268753 54.24  1
## 6 0.06127521 21.04117 0.1248 0.051080619 -0.8102884 0.4017503 57.14  0
dim(logitdata)
## [1] 684   8
sum(logitdata$ST)
## [1] 36
mean(logitdata$ST)
## [1] 0.05263158
# 自变量ARA的直方图
hist(logitdata$ARA, 
     main = "应收款占比", 
     xlab = "ARA", 
     ylab = "Frequency",
     col = "skyblue")

par(mfrow = c(2, 3))
hist(logitdata$ASSET, 
     main = "对数资产归模", 
     xlab = "ASSET", 
     ylab = "Frequency",
     col = "skyblue")
hist(logitdata$ATO, 
     main = "资产周转率", 
     xlab = "ATO", 
     ylab = "Frequency",
     col = "skyblue")
hist(logitdata$ROA,
     main = "资产回报率",
     xlab = "ROA",
     ylab = "Frequency",
     col = "skyblue")
hist(logitdata$GROWTH,
     main = "销售收入增长率",
     xlab = "GROWTH",
     ylab = "Frequency",
     col = "skyblue")
hist(logitdata$LEV,
     main = "杠杆率",
     xlab = "LEV",
     ylab = "Frequency",
     col = "skyblue")
hist(logitdata$SHARE,
     main = "第一大股东持股比率",
     xlab = "SHARE",
     ylab = "Frequency",
     col = "skyblue")

# 因变量:是否ST
par(mfrow = c(1, 1))
boxplot(ARA ~ ST, data = logitdata, 
        xlab = "是否ST",
        ylab = "应收账款占比",
        names = c("非ST", "ST"),
        col = c("skyblue", "pink"))

par(mfrow = c(2, 3))
boxplot(ASSET ~ ST, data = logitdata, 
        xlab = "是否ST",
        ylab = "对数资产规模",
        names = c("非ST", "ST"),
        col = c("skyblue", "pink"))
boxplot(ATO ~ ST, data = logitdata, 
        xlab = "是否ST",
        ylab = "资产周转率",
        names = c("非ST", "ST"),
        col = c("skyblue", "pink"))
boxplot(ROA ~ ST, data = logitdata, 
        xlab = "是否ST",
        ylab = "资产回报率",
        names = c("非ST", "ST"),
        col = c("skyblue", "pink"))
boxplot(GROWTH ~ ST, data = logitdata, 
        xlab = "是否ST",
        ylab = "销售收入增长率",
        names = c("非ST", "ST"),
        col = c("skyblue", "pink"))
boxplot(LEV ~ ST, data = logitdata, 
        xlab = "是否ST",
        ylab = "杠杆水平",
        names = c("非ST", "ST"),
        col = c("skyblue", "pink"))
boxplot(SHARE ~ ST, data = logitdata, 
        xlab = "是否ST",
        ylab = "第一大股东持股比率",
        names = c("非ST", "ST"),
        col = c("skyblue", "pink"))

# 因变量、自变量的描述性统计
N  <-  sapply(logitdata, length)
Mean  <-  sapply(logitdata, mean)
SD  <-  sapply(logitdata, sd)
Min <- sapply(logitdata, min)
Med <- sapply(logitdata, median)
Max <- sapply(logitdata, max)
result <- cbind(N, Mean, SD, Min, Med, Max)
result
##          N        Mean          SD         Min         Med        Max
## ARA    684  0.09504945  0.09228931  0.00000000  0.06832718  0.6346842
## ASSET  684 20.77785347  0.83352322 18.66070036 20.70050279 24.0176107
## ATO    684  0.51977383  0.36282648  0.00280000  0.43340000  3.1513000
## ROA    684  0.05587011  0.03859391  0.00008170  0.05125798  0.3111300
## GROWTH 684  0.11525745  0.30702005 -0.95072732  0.10228264  0.9985565
## LEV    684  0.40606356  0.16576397  0.01843107  0.40673974  0.9803218
## SHARE  684 46.03451754 17.68437717  4.16000000 44.95500000 88.5800000
## ST     684  0.05263158  0.22346029  0.00000000  0.00000000  1.0000000
# 建立logistic回归模型
model.full <- glm(ST ~ ARA + ASSET + ATO + ROA + GROWTH + LEV + SHARE, data = logitdata, family = binomial(link = "logit"))
summary(model.full)
## 
## Call:
## glm(formula = ST ~ ARA + ASSET + ATO + ROA + GROWTH + LEV + SHARE, 
##     family = binomial(link = "logit"), data = logitdata)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -8.86924    4.63586  -1.913  0.05573 . 
## ARA          4.87974    1.49245   3.270  0.00108 **
## ASSET        0.24660    0.22409   1.100  0.27115   
## ATO         -0.50738    0.65744  -0.772  0.44026   
## ROA         -0.63661    6.22354  -0.102  0.91853   
## GROWTH      -0.83335    0.56706  -1.470  0.14167   
## LEV          2.35415    1.20138   1.960  0.05005 . 
## SHARE       -0.01111    0.01115  -0.997  0.31891   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 282.07  on 683  degrees of freedom
## Residual deviance: 251.51  on 676  degrees of freedom
## AIC: 267.51
## 
## Number of Fisher Scoring iterations: 6
1-pchisq(282.07-251.51,7)
## [1] 7.493111e-05
c(AIC(model.full), BIC(model.full))
## [1] 267.5057 303.7293
# AIC与BIC模型选择

model.aic <- step(model.full, trace = F)
summary(model.aic)
## 
## Call:
## glm(formula = ST ~ ARA + GROWTH + LEV, family = binomial(link = "logit"), 
##     data = logitdata)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.6022     0.5646  -8.152 3.58e-16 ***
## ARA           5.1301     1.4341   3.577 0.000347 ***
## GROWTH       -0.9061     0.5471  -1.656 0.097708 .  
## LEV           2.5501     1.0778   2.366 0.017986 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 282.07  on 683  degrees of freedom
## Residual deviance: 254.04  on 680  degrees of freedom
## AIC: 262.04
## 
## Number of Fisher Scoring iterations: 6
model.bic <- step(model.full, k = log(N[1]), trace = F)
summary(model.bic)
## 
## Call:
## glm(formula = ST ~ ARA, family = binomial(link = "logit"), data = logitdata)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.6834     0.2722 -13.531  < 2e-16 ***
## ARA           6.3316     1.3132   4.821 1.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 282.07  on 683  degrees of freedom
## Residual deviance: 261.71  on 682  degrees of freedom
## AIC: 265.71
## 
## Number of Fisher Scoring iterations: 6
# 模型比较AUC
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc.full <- roc(logitdata$ST, predict(model.full, type = "response"))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.aic <- roc(logitdata$ST, predict(model.aic, type = "response"))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.bic <- roc(logitdata$ST, predict(model.bic, type = "response"))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.full$auc
## Area under the curve: 0.77
roc.aic$auc
## Area under the curve: 0.7738
roc.bic$auc
## Area under the curve: 0.671
par(mfrow = c(1, 1))
plot(roc.full, col = "skyblue", lwd = 2, main = "ROC Curve")
plot(roc.aic, col = "pink", lwd = 2, add = TRUE)
plot(roc.bic, col = "lightgreen", lwd = 2, add = TRUE)
legend("bottomright", legend = c("Full Model", "AIC Model", "BIC Model"), col = c("skyblue", "pink", "lightgreen"), lwd = 2)

# 基于外样本的模型精度评估
# 重复100次,每次随机抽取80%的数据作为训练集,20%的数据作为测试集
nsimu <- 100
ss <- nrow(logitdata)
ss0 <- round(ss*0.8)
auc.full <- rep(0, nsimu)
AUC <- as.data.frame(matrix(0, nrow = nsimu, ncol = 3))
names(AUC) <- c("Full", "AIC", "BIC")
for (i in 1:nsimu) {
  index <- sample(1:ss, ss0)
  model.full <- glm(ST ~ ARA + ASSET + ATO + ROA + GROWTH + LEV + SHARE, data = logitdata[index,], family = binomial(link = "logit"))
  model.aic <- glm(ST ~ ARA + GROWTH + LEV, data = logitdata[index,], family = binomial(link = "logit"))
  model.bic <- glm(ST ~ ARA, data = logitdata[index,], family = binomial(link = "logit"))
  roc.full <- roc(logitdata$ST[-index], predict(model.full, logitdata[-index,], type = "response"), quiet = TRUE)
  roc.aic <- roc(logitdata$ST[-index], predict(model.aic, logitdata[-index,], type = "response"), quiet = TRUE)
  roc.bic <- roc(logitdata$ST[-index], predict(model.bic, logitdata[-index,], type = "response"), quiet = TRUE)
  AUC[i,] <- c(roc.full$auc, roc.aic$auc, roc.bic$auc)
}
boxplot(AUC, main = "外样本AUC对比", col = c("skyblue", "pink", "lightgreen"))

定序回归

# 读入数据
orderdata <- read.csv("ordered.csv", fileEncoding = "GBK")
head(orderdata)
##   gender     usage credit loan history accounts status
## 1   女性 0.9900680   3000    0       0        0      0
## 2   女性 1.3578084   3000    0       7        3      1
## 3   男性 1.2766882   5000    0       7        2      3
## 4   男性 0.7242874   3000 1745       0        0      0
## 5   男性 1.0739191   2000    0       0       11      0
## 6   女性 1.1730013   3000    0       0        2      3
orderdata$Y1 <- 1*(orderdata$status > 0)
orderdata$Y2 <- orderdata$status
head(orderdata)
##   gender     usage credit loan history accounts status Y1 Y2
## 1   女性 0.9900680   3000    0       0        0      0  0  0
## 2   女性 1.3578084   3000    0       7        3      1  1  1
## 3   男性 1.2766882   5000    0       7        2      3  1  3
## 4   男性 0.7242874   3000 1745       0        0      0  0  0
## 5   男性 1.0739191   2000    0       0       11      0  0  0
## 6   女性 1.1730013   3000    0       0        2      3  1  3
dim(orderdata)
## [1] 8000    9
mean(orderdata$Y1)
## [1] 0.61475
# 筛选出Y1=1的数据
orderdata1 <- orderdata[orderdata$Y1 == 1,]
head(orderdata1)
##    gender     usage credit loan history accounts status Y1 Y2
## 2    女性 1.3578084   3000    0       7        3      1  1  1
## 3    男性 1.2766882   5000    0       7        2      3  1  3
## 6    女性 1.1730013   3000    0       0        2      3  1  3
## 7    男性 1.0302797   5000    0       0        5      6  1  6
## 8    女性 0.9457060   4000    0       2        2      1  1  1
## 10   男性 0.6010362   3000    0       0        1      2  1  2
# 分析Y1=1时的不同类型
library(tidyverse)
table(orderdata1$Y2) %>% 
  barplot(main = "Y2分布", 
          xlab = "逾期严重程度", 
          ylab = "频数",
          las = 1,
          col = "skyblue")

# 合并部分类
orderdata$Y2 <- 4*(orderdata$Y2>4) + orderdata$Y2 *(orderdata$Y2 <= 4)
orderdata1 <- orderdata[orderdata$Y1 == 1,]
head(orderdata1)
##    gender     usage credit loan history accounts status Y1 Y2
## 2    女性 1.3578084   3000    0       7        3      1  1  1
## 3    男性 1.2766882   5000    0       7        2      3  1  3
## 6    女性 1.1730013   3000    0       0        2      3  1  3
## 7    男性 1.0302797   5000    0       0        5      6  1  4
## 8    女性 0.9457060   4000    0       2        2      1  1  1
## 10   男性 0.6010362   3000    0       0        1      2  1  2
table(orderdata1$Y2)
## 
##    1    2    3    4 
## 1189 2018 1110  601
# 对新的Y2重绘制图
table(orderdata1$Y2) %>% 
  barplot(main = "Y2分布", 
          xlab = "逾期严重程度", 
          ylab = "频数",
          las = 1,
          col = "skyblue")

# 性别分布
tab <- table(orderdata$gender)
tab
## 
## 女性 男性 
## 2544 5456
tab/sum(tab)
## 
##  女性  男性 
## 0.318 0.682
prop.table(tab)
## 
##  女性  男性 
## 0.318 0.682
# usage:信用卡相对使用率的分布
hist(orderdata$usage, 
     main = "信用卡相对使用率", 
     xlab = "usage", 
     ylab = "频数",
     las = 1,
     col = "skyblue")

orderdata[orderdata$usage > 4,]
##      gender     usage credit  loan history accounts status Y1 Y2
## 903    男性  6.074610  50000 17268       8        4      3  1  3
## 1333   男性  4.241007   3000 25571       2        4      4  1  4
## 2347   男性  4.683206  10000  6811       9        1      1  1  1
## 2357   女性  6.024418  12000 13799       2       18      1  1  1
## 3135   男性  5.095060   1000  1774       0        4      0  0  0
## 3722   男性  4.293418   1000     0       3       12      2  1  2
## 3805   男性  5.271930   1000     0       0        4      4  1  4
## 6148   女性 13.088921  10000  6550       0        3      3  1  3
## 7069   男性  4.597112   1000     0       0        4      0  0  0
# 定义正常使用率usage=1与非正常使用率usage=0
orderdata$usage <- 1*(orderdata$usage <= 1)
tab <- table(orderdata$usage)
tab
## 
##    0    1 
## 3329 4671
tab/sum(tab)
## 
##        0        1 
## 0.416125 0.583875
prop.table(tab)
## 
##        0        1 
## 0.416125 0.583875
# credit:信用额度的分布
hist(orderdata$credit, 
     main = "信用额度", 
     xlab = "credit", 
     ylab = "频数",
     las = 1,
     col = "skyblue")

# 对数变换
orderdata$credit <- log(orderdata$credit)
hist(orderdata$credit, 
     main = "对数信用额度", 
     xlab = "log(credit)", 
     ylab = "频数",
     las = 1,
     col = "skyblue")

# 定义两个与loan有关的两个解释变量,有无及金额
orderdata$Z1 <- 1*(orderdata$loan > 0)
orderdata$Z2 <- orderdata$loan
orderdata[orderdata$loan > 0,]$Z2 <- log(orderdata[orderdata$loan > 0,]$loan)
head(orderdata)
##   gender usage   credit loan history accounts status Y1 Y2 Z1      Z2
## 1   女性     1 8.006368    0       0        0      0  0  0  0 0.00000
## 2   女性     0 8.006368    0       7        3      1  1  1  0 0.00000
## 3   男性     0 8.517193    0       7        2      3  1  3  0 0.00000
## 4   男性     1 8.006368 1745       0        0      0  0  0  1 7.46451
## 5   男性     0 7.600902    0       0       11      0  0  0  0 0.00000
## 6   女性     0 8.006368    0       0        2      3  1  3  0 0.00000
# 房贷比例
mean(orderdata$Z1)
## [1] 0.169625
# 有房贷的金额分布
hist(orderdata[orderdata$Z1 == 1,]$Z2, 
     main = "有房贷者金额分布", 
     xlab = "对数房贷月供", 
     ylab = "频数",
     las = 1,
     col = "skyblue")

# history历史逾期次数的分布
hist(orderdata$history, 
     main = "历史逾期次数分布", 
     xlab = "历史逾期次数", 
     ylab = "频数",
     las = 1,
     col = "skyblue")

# account信用卡开卡数的分布
summary(orderdata$account)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   3.365   4.000  67.000
hist_data <- hist(orderdata$account, 
     main = "信用卡开卡数分布", 
     xlab = "信用卡开卡数", 
     ylab = "频数",
     ylim = c(0, 7000),
     las = 1,
     col = "skyblue")

# 添加频数值到图上
text(
  x = hist_data$mids,       # 每个柱子的中间位置
  y = hist_data$counts,     # 每个柱子的高度
  labels = hist_data$counts, # 频数值
  pos = 3,                  # 标签放在柱子顶部
  cex = 0.8,                # 调整字体大小
  col = "red"               # 设置字体颜色
)

# 提取数据并转换为数据框
hist_table <- data.frame(
  Bin_Start = hist_data$breaks[-length(hist_data$breaks)], # 每个区间的起点
  Bin_End = hist_data$breaks[-1],                         # 每个区间的终点
  Frequency = hist_data$counts                            # 每个区间的频数
)

# 显示表格
hist_table
##    Bin_Start Bin_End Frequency
## 1          0       5      6420
## 2          5      10       992
## 3         10      15       391
## 4         15      20       127
## 5         20      25        36
## 6         25      30        19
## 7         30      35         3
## 8         35      40         7
## 9         40      45         1
## 10        45      50         2
## 11        50      55         1
## 12        55      60         0
## 13        60      65         0
## 14        65      70         1
# 结合Y1对gender,usage,X1进行描述性统计
prop.table(table(orderdata$gender, orderdata$Y1), 1)
##       
##                0         1
##   女性 0.4465409 0.5534591
##   男性 0.3566716 0.6433284
prop.table(table(orderdata$usage, orderdata$Y1), 1)
##    
##             0         1
##   0 0.2820667 0.7179333
##   1 0.4587883 0.5412117
prop.table(table(orderdata$Z1, orderdata$Y1), 1)
##    
##             0         1
##   0 0.3605299 0.6394701
##   1 0.5062638 0.4937362
mu1 <- tapply(orderdata$Y1, orderdata$gender, mean)
mu1
##      女性      男性 
## 0.5534591 0.6433284
mu2 <- tapply(orderdata$Y1, orderdata$usage, mean)
mu2
##         0         1 
## 0.7179333 0.5412117
mu3 <- tapply(orderdata$Y1, orderdata$Z1, mean)
mu3
##         0         1 
## 0.6394701 0.4937362
# Y1与连续型X作箱式图
par(mfrow = c(1, 3))
orderdata %>% 
  boxplot(credit ~ Y1,
          .,
        xlab = "是否逾期", 
        ylab = "对数授信额度",
        names = c("非逾期", "逾期"),
        col = c("skyblue", "pink"))
orderdata %>% 
  boxplot(history ~ Y1,
        .,
        xlab = "是否逾期", 
        ylab = "历史逾期次数",
        names = c("非逾期", "逾期"),
        col = c("skyblue", "pink"))
orderdata %>% 
  boxplot(accounts ~ Y1,
        .,
        xlab = "是否逾期", 
        ylab = "信用卡开卡数",
        names = c("非逾期", "逾期"),
        col = c("skyblue", "pink"))

# 有房贷的月供的对数对Y1做箱式图
par(mfrow = c(1, 1))
orderdata1 <- orderdata[orderdata$Z1 == 1,]
boxplot(Z2 ~ Y1,
        orderdata1,
        main = "有房贷者月供分布", 
        xlab = "是否逾期", 
        ylab = "对数房贷月供",
        names = c("非逾期", "逾期"),
        col = c("skyblue", "pink"))

# 结合Y2对gender,usage,X1进行描述性统计
aa <- orderdata[orderdata$Y1 == 1,]
aa$sex <- 1*(aa$gender == "男性")
dim(aa)
## [1] 4918   12
head(aa)
##    gender usage   credit loan history accounts status Y1 Y2 Z1 Z2 sex
## 2    女性     0 8.006368    0       7        3      1  1  1  0  0   0
## 3    男性     0 8.517193    0       7        2      3  1  3  0  0   1
## 6    女性     0 8.006368    0       0        2      3  1  3  0  0   0
## 7    男性     0 8.517193    0       0        5      6  1  4  0  0   1
## 8    女性     1 8.294050    0       2        2      1  1  1  0  0   0
## 10   男性     1 8.006368    0       0        1      2  1  2  0  0   1
par(mfrow = c(1,3))
plot(tapply(aa$sex, aa$Y2, mean), 
     type = "b", 
     xlab = "逾期严重程度", 
     ylab = "男性占比",
     ylim = c(0, 1)
     )
plot(tapply(aa$usage, aa$Y2, mean), 
     type = "b", 
     xlab = "逾期严重程度", 
     ylab = "正常使用占比",
     ylim = c(0, 1)
     )
plot(tapply(aa$Z1,aa$Y2, mean), 
     type = "b", 
     xlab = "逾期严重程度", 
     ylab = "房贷比例",
     ylim = c(0, 1)
     )

# Y2与连续型X(对数授信额度、历史逾期次数、信用卡开户数、对数月供)作箱式图

par(mfrow = c(2, 2))
boxplot(credit ~ Y2,
        aa,
        xlab = "逾期严重程度", 
        ylab = "对数授信额度",
        col = c("skyblue", "pink"))
boxplot(history ~ Y2,
        aa,
        xlab = "逾期严重程度", 
        ylab = "历史逾期次数",
        col = c("skyblue", "pink"))
boxplot(accounts ~ Y2,
        aa,
        xlab = "逾期严重程度", 
        ylab = "信用卡开户数",
        col = c("skyblue", "pink"))
aa1 <- aa[aa$loan > 0,]
boxplot(Z2 ~ Y2,
        aa1,
        xlab = "逾期严重程度", 
        ylab = "对数月供",
        col = c("skyblue", "pink"))

# 对是否逾期建立0-1回归分析
orderdata$gender <- factor(orderdata$gender, levels = c("男性", "女性"))
# orderdata$gender <- relevel(orderdata$gender, ref = "男性")
model.full <- glm(Y1 ~ gender + usage + credit + Z1 + Z2 + history + accounts , data = orderdata, family = binomial(link = "logit"))
summary(model.full)
## 
## Call:
## glm(formula = Y1 ~ gender + usage + credit + Z1 + Z2 + history + 
##     accounts, family = binomial(link = "logit"), data = orderdata)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.369341   0.385580   8.738  < 2e-16 ***
## gender女性  -0.261012   0.054073  -4.827 1.39e-06 ***
## usage       -0.360500   0.056459  -6.385 1.71e-10 ***
## credit      -0.378850   0.045810  -8.270  < 2e-16 ***
## Z1           0.401117   0.526887   0.761   0.4465    
## Z2          -0.101542   0.066921  -1.517   0.1292    
## history      0.605662   0.023140  26.174  < 2e-16 ***
## accounts     0.010548   0.006208   1.699   0.0893 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10665.2  on 7999  degrees of freedom
## Residual deviance:  8924.5  on 7992  degrees of freedom
## AIC: 8940.5
## 
## Number of Fisher Scoring iterations: 5
1-pchisq(10665.2-8924.5, 7999-7992)
## [1] 0
# 自动寻找最优的AIC模型
model.aic <- step(model.full, trace = F)
summary(model.aic)
## 
## Call:
## glm(formula = Y1 ~ gender + usage + credit + Z2 + history + accounts, 
##     family = binomial(link = "logit"), data = orderdata)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.403849   0.382845   8.891  < 2e-16 ***
## gender女性  -0.261165   0.054072  -4.830 1.37e-06 ***
## usage       -0.360641   0.056457  -6.388 1.68e-10 ***
## credit      -0.382747   0.045514  -8.409  < 2e-16 ***
## Z2          -0.051068   0.008937  -5.714 1.10e-08 ***
## history      0.605350   0.023130  26.172  < 2e-16 ***
## accounts     0.010323   0.006200   1.665   0.0959 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10665.2  on 7999  degrees of freedom
## Residual deviance:  8925.1  on 7993  degrees of freedom
## AIC: 8939.1
## 
## Number of Fisher Scoring iterations: 5
1-pchisq(10665.2-8925.1, 7999-7993)
## [1] 0
# 自动寻找最优的BIC模型
ss = length(orderdata$Y1)
model.bic <- step(model.full, k = log(ss), trace = F)
summary(model.bic)
## 
## Call:
## glm(formula = Y1 ~ gender + usage + credit + Z2 + history, family = binomial(link = "logit"), 
##     data = orderdata)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.528699   0.375333   9.402  < 2e-16 ***
## gender女性  -0.264004   0.054045  -4.885 1.03e-06 ***
## usage       -0.390036   0.053674  -7.267 3.68e-13 ***
## credit      -0.391475   0.045190  -8.663  < 2e-16 ***
## Z2          -0.048709   0.008818  -5.524 3.32e-08 ***
## history      0.603212   0.023069  26.148  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10665.2  on 7999  degrees of freedom
## Residual deviance:  8927.9  on 7994  degrees of freedom
## AIC: 8939.9
## 
## Number of Fisher Scoring iterations: 5
1-pchisq(10665.2-8927.9, 7999-7994)
## [1] 0
# 对3个模型做内样本预测
pred.full <- predict(model.full, type = "response")
pred.aic <- predict(model.aic, type = "response")
pred.bic <- predict(model.bic, type = "response")

# 计算AUC
library(pROC)
roc.full <- roc(orderdata$Y1, pred.full)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.aic <- roc(orderdata$Y1, pred.aic)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.bic <- roc(orderdata$Y1, pred.bic)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(c(roc.full$auc, roc.aic$auc, roc.bic$auc))
## [1] 0.7677270 0.7676767 0.7669658
# 画ROC曲线
par(mfrow = c(1, 1))
plot(roc.full, main = "全模型")

plot(roc.aic, main = "AIC模型")

plot(roc.bic, main = "BIC模型")

# 在有逾期的样本上建立关于逾期严重程度的定序回归模型
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
aa$gender <- factor(aa$gender, levels = c("男性", "女性"))
probit.full <- polr(as.factor(Y2) ~ gender + usage + credit + Z1 + Z2 + history + accounts, method = "probit", data = aa, Hess = TRUE)
summary(probit.full)
## Call:
## polr(formula = as.factor(Y2) ~ gender + usage + credit + Z1 + 
##     Z2 + history + accounts, data = aa, Hess = TRUE, method = "probit")
## 
## Coefficients:
##               Value Std. Error t value
## gender女性 -0.06715   0.034037  -1.973
## usage      -0.02486   0.032584  -0.763
## credit     -0.10441   0.029553  -3.533
## Z1          0.44756   0.365270   1.225
## Z2         -0.06693   0.046794  -1.430
## history     0.05429   0.005450   9.962
## accounts    0.02114   0.003504   6.032
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -1.4285  0.2461    -5.8037
## 2|3 -0.3110  0.2456    -1.2660
## 3|4  0.4829  0.2459     1.9634
## 
## Residual Deviance: 12612.04 
## AIC: 12632.04
# Y2有3个不同的截距项
tab <- as.data.frame(coefficients(summary(probit.full)))
tab$p.value <- round(2*(1-pnorm(abs(tab$`t value`))), 3)
tab
##                  Value  Std. Error   t value p.value
## gender女性 -0.06714918 0.034037304 -1.972811   0.049
## usage      -0.02486039 0.032584002 -0.762963   0.445
## credit     -0.10440799 0.029553035 -3.532902   0.000
## Z1          0.44755705 0.365270145  1.225277   0.220
## Z2         -0.06692980 0.046793651 -1.430318   0.153
## history     0.05429390 0.005450082  9.962034   0.000
## accounts    0.02113707 0.003504275  6.031795   0.000
## 1|2        -1.42853910 0.246144151 -5.803669   0.000
## 2|3        -0.31095087 0.245608760 -1.266041   0.205
## 3|4         0.48289766 0.245944751  1.963440   0.050
# 建立空模型null model,只包含截距项
probit.null <- polr(as.factor(Y2) ~ 1, method = "probit", data = aa, Hess = TRUE)
summary(probit.null)
## Call:
## polr(formula = as.factor(Y2) ~ 1, data = aa, Hess = TRUE, method = "probit")
## 
## No coefficients
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -0.7007   0.0196   -35.8202
## 2|3   0.3910   0.0184    21.2785
## 3|4   1.1641   0.0230    50.5037
## 
## Residual Deviance: 12802.75 
## AIC: 12808.75
# 全模型整体显著性检验
# 全模型与空模型的比较
1-pchisq(12802.75 - 12612.04, 7997 - 7990)
## [1] 0
# AIC模型
probit.aic <- step(probit.full, trace = F)
summary(probit.aic)
## Call:
## polr(formula = as.factor(Y2) ~ gender + credit + Z2 + history + 
##     accounts, data = aa, Hess = TRUE, method = "probit")
## 
## Coefficients:
##               Value Std. Error t value
## gender女性 -0.06797   0.034029  -1.998
## credit     -0.11018   0.029279  -3.763
## Z2         -0.01018   0.005985  -1.701
## history     0.05495   0.005364  10.244
## accounts    0.02171   0.003374   6.436
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -1.4607  0.2443    -5.9783
## 2|3 -0.3434  0.2438    -1.4087
## 3|4  0.4501  0.2441     1.8440
## 
## Residual Deviance: 12614.12 
## AIC: 12630.12
tab <- as.data.frame(coefficients(summary(probit.aic)))
tab$p.value <- round(2*(1-pnorm(abs(tab$`t value`))), 3)
tab
##                  Value  Std. Error   t value p.value
## gender女性 -0.06797396 0.034028753 -1.997545   0.046
## credit     -0.11018141 0.029278503 -3.763219   0.000
## Z2         -0.01017734 0.005984540 -1.700605   0.089
## history     0.05494772 0.005363807 10.244163   0.000
## accounts    0.02171399 0.003373886  6.435898   0.000
## 1|2        -1.46067265 0.244330975 -5.978254   0.000
## 2|3        -0.34339596 0.243768536 -1.408697   0.159
## 3|4         0.45012790 0.244097940  1.844046   0.065
1-pchisq(12802.75 - 12614.12, 7997 - 7992)
## [1] 0
## BIC模型
ss2 <- length(aa$Y2)
probit.bic <- step(probit.full, k = log(ss2), trace = F)
summary(probit.bic)
## Call:
## polr(formula = as.factor(Y2) ~ credit + history + accounts, data = aa, 
##     Hess = TRUE, method = "probit")
## 
## Coefficients:
##             Value Std. Error t value
## credit   -0.12271   0.028166  -4.357
## history   0.05542   0.005358  10.345
## accounts  0.02112   0.003334   6.334
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -1.5344  0.2361    -6.5001
## 2|3 -0.4181  0.2354    -1.7760
## 3|4  0.3752  0.2358     1.5914
## 
## Residual Deviance: 12621.05 
## AIC: 12633.05
tab <- as.data.frame(coefficients(summary(probit.bic)))
tab$p.value <- round(2*(1-pnorm(abs(tab$`t value`))), 3)
tab
##                Value  Std. Error   t value p.value
## credit   -0.12270807 0.028166195 -4.356572   0.000
## history   0.05542479 0.005357707 10.344871   0.000
## accounts  0.02111930 0.003334170  6.334198   0.000
## 1|2      -1.53438168 0.236054863 -6.500106   0.000
## 2|3      -0.41812594 0.235431197 -1.776001   0.076
## 3|4       0.37517650 0.235753519  1.591393   0.112
1-pchisq(12802.75 - 12621.05, 7997 - 7994)
## [1] 0
# 分别以Y1和Y2为因变量的BIC模型进行预测
new <- data.frame(gender = "男性", usage = 1, credit = log(20000), loan = 4000,  Z1 = 1, Z2 = log(4000), history = 4, accounts = 3)
pred <- predict(model.bic, new)
exp(pred)/(1+exp(pred))
##         1 
## 0.7808363
predict(probit.bic,new)
## [1] 2
## Levels: 1 2 3 4

Poisson回归

poisson <- read.csv("poisson.csv", fileEncoding = "GBK")
dim(poisson)
## [1] 80000     6
head(poisson)
##            keyword impression click cost ranking conversion
## 1 飞机票价格怎么算         55     4 1.89    3.39          0
## 2   武汉特价飞机票        198     5 3.76    7.72          0
## 3     广州机票特价          6     1 0.98    2.00          0
## 4       昆明飞机票        205     1 0.63   10.56          0
## 5       北京机票网         21     1 0.89    4.82          0
## 6         武汉飞机        466     4 2.66    6.53          0
# 计算关键词长度
poisson$kwlen <- nchar(poisson$keyword)
poisson[1:5,]
##            keyword impression click cost ranking conversion kwlen
## 1 飞机票价格怎么算         55     4 1.89    3.39          0     8
## 2   武汉特价飞机票        198     5 3.76    7.72          0     7
## 3     广州机票特价          6     1 0.98    2.00          0     6
## 4       昆明飞机票        205     1 0.63   10.56          0     5
## 5       北京机票网         21     1 0.89    4.82          0     5
# 对关键词进行分词
# install.packages("jiebaR")
library(jiebaR)
## Loading required package: jiebaRD
## 
## Attaching package: 'jiebaR'
## The following object is masked from 'package:psych':
## 
##     distance
my.worker <- worker()
kw.seg <- segment(poisson$keyword, my.worker)

tab <- table(kw.seg)
tab <- sort(tab, decreasing = T)

tab2 <- tab[1:100]
tab2 <- tab2/sum(tab)
tab2
## kw.seg
##        机票      飞机票    特价机票        查询        便宜        预订 
## 0.143109670 0.076686612 0.049180103 0.047225140 0.033362262 0.027684648 
##        打折        深圳          的        飞机        航班        特价 
## 0.024857260 0.023742749 0.023249440 0.022678482 0.021225963 0.021157448 
##        上海        广州      订机票        三亚        北京        网站 
## 0.018448819 0.017101357 0.016886676 0.015826977 0.015699082 0.013977070 
##      时刻表          网        厦门        预定          最        网上 
## 0.013209702 0.013205134 0.011122277 0.011021788 0.010359476 0.009975791 
##          买        南京        昆明        折扣    机票价格      哈尔滨 
## 0.009879870 0.008144156 0.008112182 0.007952314 0.007646280 0.007427031 
##        长沙        天津        武汉        海口        票价      机票网 
## 0.007239757 0.007189513 0.007157539 0.007006806 0.006065866 0.005988215 
##        订购        重庆        大连    北京机票          去        订票 
## 0.005988215 0.005709588 0.005577125 0.005129493 0.005033572 0.004946787 
##        哪个    乌鲁木齐        杭州        成都        价格        春节 
## 0.004873704 0.004846298 0.004663591 0.004636185 0.004426072 0.004416937 
##        西安        低价        郑州      多少钱        济南          订 
## 0.004380396 0.004266204 0.004238798 0.004197689 0.003964738 0.003777463 
##          定    呼和浩特        什么        哪里        长春        南昌 
## 0.003676974 0.003640433 0.003539944 0.003521674 0.003444023 0.003275019 
##        官网        廉价    往返机票    优惠机票        哪儿        烟台 
## 0.003192801 0.003165395 0.003128854 0.003101448 0.002795414 0.002749737 
##        五一        南宁        怎么        沈阳        太原          到 
## 0.002640113 0.002640113 0.002571598 0.002283835 0.002256429 0.002215320 
##    成都机票          吗    西双版纳        珠海        十一        最低 
## 0.001936692 0.001904718 0.001881880 0.001840771 0.001763121 0.001758553 
##        2013        时候        青岛        团购    国内机票      途牛网 
## 0.001731147 0.001658064 0.001616955 0.001557575 0.001539305 0.001521034 
##        特惠        航空        途牛        丽江          年      携程网 
## 0.001466222 0.001333760 0.001310921 0.001306354 0.001274380 0.001269812 
##    西安机票        哪网        电话          好        那个        儿童 
## 0.001228703 0.001224136 0.001219568 0.001215000 0.001146485 0.001119079 
##        往返      头等舱        民航        旅游 
## 0.001109944 0.001096241 0.001082538 0.001068835
sum(tab2)
## [1] 0.9498972
# 生成新的解释变量,4水平的定性因子型变量ticket
ss <- length(poisson$keyword)
ticket <- rep("其他", ss)
kw <- poisson$keyword

pos <- grep("飞机票", kw)
ticket[pos] <- "飞机票"
kw <- gsub("飞机票", "-", kw)

pos <- grep("机票", kw)
ticket[pos] <- "机票"
kw <- gsub("机票", "-", kw)

pos <- grep("航班", kw)
ticket[pos] <- "航班"
kw <- gsub("航班", "-", kw)

head(ticket)
## [1] "飞机票" "飞机票" "机票"   "飞机票" "机票"   "其他"
class(ticket)
## [1] "character"
table(factor(ticket,levels = c("飞机票",  "航班","机票", "其他")))
## 
## 飞机票   航班   机票   其他 
##  17658   4647  52401   5294
poisson$ticket <- ticket

# 生成新的解释变量,9水平的定性因子型变量price
kw.pattern <- c("特价", "便宜", "打折", "折扣", "低价", "廉价", "特惠", "最")
tmp <- rep("其他", ss)
kw <- poisson$keyword
for (i in 1:length(kw.pattern)) {
  pos <- grep(kw.pattern[i], kw)
  tmp[pos] <- kw.pattern[i]
  kw <- gsub(kw.pattern[i], "-", kw)
}
tmp <- factor(tmp)
poisson$price <- tmp
table(tmp)
## tmp
##  低价  便宜  其他    最  廉价  打折  折扣  特价  特惠 
##   937  5042 47967  2762   693  5441  1460 15377   321
# 生成新的解释变量,10水平的定性因素型变量buy购买意愿
kw.pattern <- c("定", "订", "购", "买", "查询", "多少钱", "价格", "哪个", "时刻表")
tmp <- rep("其他", ss)
kw <- poisson$keyword
for (i in 1:length(kw.pattern)) {
  pos <- grep(kw.pattern[i], kw)
  tmp[pos] <- kw.pattern[i]
  kw <- gsub(kw.pattern[i], "-", kw)
}
poisson$buy <- tmp
freq_table <- table(tmp)
sorted_table <- freq_table[order(stringi::stri_trans_general(names(freq_table), "Han-Latin"))]
sorted_table
## tmp
##   查询     定     订 多少钱     购   价格     买   哪个   其他 时刻表 
##   8954   3229  11066    919   2016   2643   2295   1067  44919   2892
# 生成新的解释变量,5水平的定性因素型变量city1一线城市
kw.pattern <- c("北京", "上海", "广州", "深圳")
tmp <- rep("其他", ss)
kw <- poisson$keyword
for (i in 1:length(kw.pattern)) {
  pos <- grep(kw.pattern[i], kw)
  tmp[pos] <- kw.pattern[i]
  kw <- gsub(kw.pattern[i], "-", kw)
}
poisson$city1 <- tmp
freq_table <- table(tmp)
sorted_table <- freq_table[order(stringi::stri_trans_general(names(freq_table), "Han-Latin"))]
sorted_table
## tmp
##  北京  广州  其他  上海  深圳 
##  4543  3753 62476  4030  5198
# 生成新的解释变量,21水平的定性因素型变量city2热点城市
kw.pattern=c("三亚","厦门","南京","昆明","哈尔滨","天津",
             "长沙","武汉","海口","重庆","大连","乌鲁木齐" ,"杭州",
             "成都","西安","郑州","济南","呼和浩特","长春","南昌")
tmp <- rep("其他", ss)
for (i in 1:length(kw.pattern)) {
  pos <- grep(kw.pattern[i], kw)
  tmp[pos] <- kw.pattern[i]
  kw <- gsub(kw.pattern[i], "-", kw)
}
poisson$city2 <- tmp
freq_table <- table(tmp)
sorted_table <- freq_table[order(stringi::stri_trans_general(names(freq_table), "Han-Latin"))]
sorted_table
## tmp
##     成都     重庆     大连   哈尔滨     海口     杭州 呼和浩特     济南 
##     1439     1250     1221     1626     1532     1021      797      868 
##     昆明     南昌     南京     其他     三亚     厦门     天津     武汉 
##     1776      717     1783    51337     3465     2471     1574     1567 
## 乌鲁木齐     西安     长春     长沙     郑州 
##     1061     1228      754     1585      928
# 生成新的解释变量,7水平的定性因素型变量brand广告主品牌
kw.pattern=c("携程","途牛","去哪儿","艺龙","同程","酷讯")
tmp <- rep("其他", ss)
for (i in 1:length(kw.pattern)) {
  pos <- grep(kw.pattern[i], kw)
  tmp[pos] <- kw.pattern[i]
  kw <- gsub(kw.pattern[i], "-", kw)
}
poisson$brand <- tmp
freq_table <- table(tmp)
sorted_table <- freq_table[order(stringi::stri_trans_general(names(freq_table), "Han-Latin"))]
sorted_table
## tmp
##   酷讯   其他 去哪儿   同程   途牛   携程   艺龙 
##    149  77939    601    129    620    436    126
# 定义两个因变量Y1(有无转化)和Y2(转化次数)
poisson$Y1 <- 1*(poisson$conversion > 0)
poisson$Y2 <- poisson$Y1 *(poisson$conversion - 1 )
poisson$CTR <- poisson$click/poisson$impression * 100
poisson$CPC <- poisson$cost/poisson$click

# 有转化数据中Y2的直方图
aa <- poisson[poisson$Y1 == 1,]
hist(aa$conversion , 
     xlab = "转化程度", 
     ylab = "频数",
     col = "skyblue")

# 提取转化量过高的关键词Y2>60
# 检验是否有异常值
aa[aa$Y2 > 60,]
##            keyword impression click    cost ranking conversion kwlen ticket
## 6739      机票预订      32771  1302 1916.41    8.09         95     4   机票
## 8148  网上预订机票       1537   206  164.18    7.67         64     6   机票
## 11543         机票      61480  1277 2100.01    3.95         62     2   机票
## 29079     机票预订      35349  1368 2017.65    8.43         92     4   机票
## 36229 网上预订机票       1293   409  341.06    5.51         70     6   机票
## 39722 网上预订机票       1499   218  172.30    6.94         62     6   机票
## 50506 网上预订机票        911   176  141.27    5.70         69     6   机票
## 75322     机票预订      92999  1107 1252.46    7.81         75     4   机票
## 75459     机票预订      31136  1105 1632.17    9.03         92     4   机票
##       price  buy city1 city2 brand Y1 Y2       CTR       CPC
## 6739   其他   订  其他  其他  其他  1 94  3.973025 1.4718971
## 8148   其他   订  其他  其他  其他  1 63 13.402733 0.7969903
## 11543  其他 其他  其他  其他  其他  1 61  2.077098 1.6444871
## 29079  其他   订  其他  其他  其他  1 91  3.869982 1.4748904
## 36229  其他   订  其他  其他  其他  1 69 31.631864 0.8338875
## 39722  其他   订  其他  其他  其他  1 61 14.543029 0.7903670
## 50506  其他   订  其他  其他  其他  1 68 19.319429 0.8026705
## 75322  其他   订  其他  其他  其他  1 74  1.190335 1.1314002
## 75459  其他   订  其他  其他  其他  1 91  3.548947 1.4770769
# 对展现量、点击率及单位点击成本的对数做直方图
par(mfrow = c(1, 3))
hist(log(poisson$impression), 
     xlab = "展现量", 
     ylab = "频数",
     col = "skyblue")
hist(log(poisson$CTR), 
     xlab = "点击率", 
     ylab = "频数",
     col = "skyblue")
hist(log(poisson$CPC), 
     xlab = "单位点击成本", 
     ylab = "频数",
     col = "skyblue")

# 对关健词排名及关键词长度做直方图
par(mfrow = c(1, 2))
hist(poisson$ranking, 
     xlab = "排名", 
     ylab = "频数",
     col = "skyblue")
hist(poisson$kwlen,
     xlab = "关键词长度",
     ylab = "频数",
     col = "skyblue")

# 对6个离散型变量做柱状图
# 机票表达方式、价格偏好表达方式、购买意愿表达方式、一线城市、热点城市、广告主品牌
par(mfrow = c(3, 2))
library(tidyverse)

tab = table(poisson$ticket)
tab
## 
##   其他   机票   航班 飞机票 
##   5294  52401   4647  17658
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>% 
  barplot(
          las = 1,
          col = "skyblue")

tab = table(poisson$price)
tab
## 
##  低价  便宜  其他    最  廉价  打折  折扣  特价  特惠 
##   937  5042 47967  2762   693  5441  1460 15377   321
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>% 
  barplot(
          las = 1,
          col = "skyblue")

tab = table(poisson$buy)
tab
## 
##     买   价格   其他   哪个 多少钱     定 时刻表   查询     订     购 
##   2295   2643  44919   1067    919   3229   2892   8954  11066   2016
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>% 
  barplot(
          las = 1,
          col = "skyblue")

tab = table(poisson$city1)
tab
## 
##  上海  其他  北京  广州  深圳 
##  4030 62476  4543  3753  5198
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>% 
  barplot(
          las = 1,
          col = "skyblue")

tab = table(poisson$city2)
tab
## 
##     三亚 乌鲁木齐     其他     南京     南昌     厦门 呼和浩特   哈尔滨 
##     3465     1061    51337     1783      717     2471      797     1626 
##     大连     天津     成都     昆明     杭州     武汉     济南     海口 
##     1221     1574     1439     1776     1021     1567      868     1532 
##     西安     郑州     重庆     长春     长沙 
##     1228      928     1250      754     1585
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>% 
  barplot(
          las = 1,
          col = "skyblue")

tab = table(poisson$brand)
tab
## 
##   其他 去哪儿   同程   携程   艺龙   途牛   酷讯 
##  77939    601    129    436    126    620    149
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>% 
  barplot(
          las = 1,
          col = "skyblue")

# 结合Y1,对3个连续型解释变量做箱式图
par(mfrow = c(1, 3))
boxplot(log(impression) ~ Y1, 
        poisson, 
        xlab = "是否转化", 
        ylab = "对数展现量",
        names = c("未转化", "转化"),
        col = c("skyblue", "pink"))

boxplot(log(CTR) ~ Y1, 
        poisson, 
        xlab = "是否转化", 
        ylab = "对数点击率",
        names = c("未转化", "转化"),
        col = c("skyblue", "pink"))

boxplot(log(CPC) ~ Y1, 
        poisson, 
        xlab = "是否转化", 
        ylab = "对数单位点击成本",
        names = c("未转化", "转化"),
        col = c("skyblue", "pink"))

# Y1对排名、关键词长度的描述统计
par(mfrow = c(1, 2))
boxplot(ranking ~ Y1, 
        poisson, 
        xlab = "是否转化", 
        ylab = "排名",
        names = c("未转化", "转化"),
        col = c("skyblue", "pink"))
boxplot(poisson$kwlen ~ Y1, 
        poisson, 
        xlab = "是否转化", 
        ylab = "关键词长度",
        names = c("未转化", "转化"),
        col = c("skyblue", "pink"))

# Y1 对6个离散型变量的描述统计,并作柱状图
par(mfrow = c(3, 2))

mu <- tapply(poisson$Y1, poisson$ticket, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)

mu <- tapply(poisson$Y1, poisson$price, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)

mu <- tapply(poisson$Y1, poisson$buy, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)

mu <- tapply(poisson$Y1, poisson$city1, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)

mu <- tapply(poisson$Y1, poisson$city2, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)

mu <- tapply(poisson$Y1, poisson$brand, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)

# 结合Y2(转化量-1)对3个连续型解释变量做柱形图
par(mfrow = c(1, 3))

Y2 <- aa$Y2

Z <- aa$impression
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("低", "高"))

mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "展现量", ylab = "转化程度", ylim = c(0, 5))


Z <- aa$CTR
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("低", "高"))

mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "点击率", ylab = "转化程度", ylim = c(0, 5))

Z <- aa$CPC
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("低", "高"))

mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "单位点击成本", ylab = "转化程度", ylim = c(0, 5))

# Y2 对关键词排名、长度进行描述
par(mfrow = c(1,2))

Z <- aa$ranking
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("前", "后"))

mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "排名", ylab = "转化程度", ylim = c(0, 5))

Z <- aa$kwlen
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("短", "长"))

mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "关键词长度", ylab = "转化程度", ylim = c(0, 5))

# Y2 对6个离散型变量描述
par(mfrow = c(3,2))

mu <- tapply(aa$Y2, aa$ticket, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))

mu <- tapply(aa$Y2, aa$price, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))

mu <- tapply(aa$Y2, aa$buy, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))

mu <- tapply(aa$Y2, aa$city1, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))

mu <- tapply(aa$Y2, aa$city2, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))

mu <- tapply(aa$Y2, aa$brand, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))

# Y1为因变量的回归模型

logit.full <- glm(Y1 ~ log(impression) + CTR + CPC + ranking + kwlen + ticket + price + buy + city1 + city2 + brand,
                  family = binomial(link = logit), poisson)

summary(logit.full)
## 
## Call:
## glm(formula = Y1 ~ log(impression) + CTR + CPC + ranking + kwlen + 
##     ticket + price + buy + city1 + city2 + brand, family = binomial(link = logit), 
##     data = poisson)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -6.731785   0.281812 -23.888  < 2e-16 ***
## log(impression)  0.916767   0.011290  81.201  < 2e-16 ***
## CTR              0.053361   0.001707  31.261  < 2e-16 ***
## CPC              0.536719   0.065825   8.154 3.53e-16 ***
## ranking         -0.130826   0.008820 -14.833  < 2e-16 ***
## kwlen           -0.174399   0.015342 -11.367  < 2e-16 ***
## ticket机票       1.958456   0.104204  18.794  < 2e-16 ***
## ticket航班       0.220479   0.177344   1.243 0.213784    
## ticket飞机票     2.030838   0.106956  18.988  < 2e-16 ***
## price便宜       -0.609028   0.139948  -4.352 1.35e-05 ***
## price其他       -1.121852   0.128867  -8.706  < 2e-16 ***
## price最         -0.219658   0.142871  -1.537 0.124183    
## price廉价       -0.223694   0.170354  -1.313 0.189144    
## price打折       -0.476872   0.133062  -3.584 0.000339 ***
## price折扣       -0.852360   0.173582  -4.910 9.09e-07 ***
## price特价       -0.565037   0.126529  -4.466 7.98e-06 ***
## price特惠       -0.302526   0.205536  -1.472 0.141052    
## buy价格         -0.628473   0.120616  -5.211 1.88e-07 ***
## buy其他         -0.733573   0.091147  -8.048 8.40e-16 ***
## buy哪个          0.443984   0.159645   2.781 0.005418 ** 
## buy多少钱       -1.353116   0.420447  -3.218 0.001290 ** 
## buy定           -0.085912   0.106758  -0.805 0.420972    
## buy时刻表        0.077781   0.230507   0.337 0.735790    
## buy查询         -0.583232   0.099818  -5.843 5.13e-09 ***
## buy订            0.197138   0.090878   2.169 0.030062 *  
## buy购            0.137304   0.110483   1.243 0.213958    
## city1其他        0.291614   0.072800   4.006 6.18e-05 ***
## city1北京        0.142993   0.097648   1.464 0.143094    
## city1广州        0.042420   0.102225   0.415 0.678170    
## city1深圳        0.512921   0.085438   6.003 1.93e-09 ***
## city2乌鲁木齐    0.546353   0.192905   2.832 0.004622 ** 
## city2其他        0.486703   0.095364   5.104 3.33e-07 ***
## city2南京        0.367494   0.144745   2.539 0.011120 *  
## city2南昌        0.368943   0.234905   1.571 0.116275    
## city2厦门        0.018269   0.137657   0.133 0.894418    
## city2呼和浩特    0.462779   0.249161   1.857 0.063261 .  
## city2哈尔滨      0.161691   0.176643   0.915 0.360007    
## city2大连        0.161344   0.220406   0.732 0.464149    
## city2天津       -0.095125   0.202936  -0.469 0.639252    
## city2成都       -0.206132   0.200613  -1.028 0.304181    
## city2昆明        0.043972   0.168245   0.261 0.793818    
## city2杭州       -0.282414   0.245188  -1.152 0.249391    
## city2武汉        0.248119   0.180688   1.373 0.169693    
## city2济南       -0.044433   0.287671  -0.154 0.877248    
## city2海口        0.508701   0.177766   2.862 0.004215 ** 
## city2西安        0.162076   0.187977   0.862 0.388571    
## city2郑州        0.401305   0.222507   1.804 0.071300 .  
## city2重庆        0.081108   0.199626   0.406 0.684520    
## city2长春       -0.065571   0.280872  -0.233 0.815408    
## city2长沙        0.633972   0.147296   4.304 1.68e-05 ***
## brand去哪儿     -2.053580   0.210842  -9.740  < 2e-16 ***
## brand同程        1.144785   0.464446   2.465 0.013707 *  
## brand携程       -0.187904   0.172023  -1.092 0.274695    
## brand艺龙       -0.238677   0.470885  -0.507 0.612247    
## brand途牛        1.022618   0.125861   8.125 4.48e-16 ***
## brand酷讯       -1.756141   0.340073  -5.164 2.42e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 53405  on 79999  degrees of freedom
## Residual deviance: 33920  on 79944  degrees of freedom
## AIC: 34032
## 
## Number of Fisher Scoring iterations: 7